From ba1afa13722f8e38310abb68ee0246fda96b839a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20=C3=81ngel?= <miguel.sierra@heygen.com>
Date: Sat, 20 Jun 2026 16:21:47 -0400
Subject: [PATCH 1/4] fix(engine): copy mixed AAC during MP4 mux

---
 .../engine/src/services/chunkEncoder.test.ts  | 111 ++++++++++++++++++
 packages/engine/src/services/chunkEncoder.ts  |  22 +++-
 .../src/services/distributed/assemble.test.ts |   6 +-
 3 files changed, 135 insertions(+), 4 deletions(-)

diff --git a/packages/engine/src/services/chunkEncoder.test.ts b/packages/engine/src/services/chunkEncoder.test.ts
index 5b0091639a..b358fe57c7 100644
--- a/packages/engine/src/services/chunkEncoder.test.ts
+++ b/packages/engine/src/services/chunkEncoder.test.ts
@@ -355,6 +355,117 @@ describe("encodeFramesChunkedConcat ffmpegEncodeTimeout", () => {
   });
 });
 
+describe("muxVideoWithAudio audio codec handling", () => {
+  it("copies HyperFrames AAC sidecars into MP4 instead of re-encoding", async () => {
+    const { spawn, calls } = createSpawnSpy();
+    vi.resetModules();
+    vi.doMock("child_process", () => ({ spawn }));
+
+    const { muxVideoWithAudio } = await import("./chunkEncoder.js");
+    const muxPromise = muxVideoWithAudio(
+      "/tmp/video-only.mp4",
+      "/tmp/audio.aac",
+      "/tmp/output.mp4",
+      undefined,
+      undefined,
+      { num: 30, den: 1 },
+    );
+
+    expect(calls).toHaveLength(1);
+    expect(calls[0]!.args).toEqual([
+      "-i",
+      "/tmp/video-only.mp4",
+      "-i",
+      "/tmp/audio.aac",
+      "-c:v",
+      "copy",
+      "-c:a",
+      "copy",
+      "-movflags",
+      "+faststart",
+      "-avoid_negative_ts",
+      "make_zero",
+      "-r",
+      "30",
+      "-shortest",
+      "-y",
+      "/tmp/output.mp4",
+    ]);
+    expect(calls[0]!.args).not.toContain("-use_editlist");
+
+    emitClose(calls[0]!.proc, 0);
+    await expect(muxPromise).resolves.toMatchObject({
+      success: true,
+      outputPath: "/tmp/output.mp4",
+    });
+  });
+
+  it("still transcodes non-AAC audio when muxing MP4", async () => {
+    const { spawn, calls } = createSpawnSpy();
+    vi.resetModules();
+    vi.doMock("child_process", () => ({ spawn }));
+
+    const { muxVideoWithAudio } = await import("./chunkEncoder.js");
+    const muxPromise = muxVideoWithAudio(
+      "/tmp/video-only.mp4",
+      "/tmp/audio.wav",
+      "/tmp/output.mp4",
+    );
+
+    expect(calls).toHaveLength(1);
+    expect(calls[0]!.args).toContain("-c:a");
+    expect(calls[0]!.args[calls[0]!.args.indexOf("-c:a") + 1]).toBe("aac");
+    expect(calls[0]!.args).toContain("-b:a");
+    expect(calls[0]!.args).toContain("+faststart");
+
+    emitClose(calls[0]!.proc, 0);
+    await expect(muxPromise).resolves.toMatchObject({ success: true });
+  });
+
+  it("copies HyperFrames AAC sidecars into MOV containers", async () => {
+    const { spawn, calls } = createSpawnSpy();
+    vi.resetModules();
+    vi.doMock("child_process", () => ({ spawn }));
+
+    const { muxVideoWithAudio } = await import("./chunkEncoder.js");
+    const muxPromise = muxVideoWithAudio(
+      "/tmp/video-only.mov",
+      "/tmp/audio.aac",
+      "/tmp/output.mov",
+    );
+
+    expect(calls).toHaveLength(1);
+    expect(calls[0]!.args).toContain("-c:a");
+    expect(calls[0]!.args[calls[0]!.args.indexOf("-c:a") + 1]).toBe("copy");
+    expect(calls[0]!.args).not.toContain("-b:a");
+    expect(calls[0]!.args).not.toContain("+faststart");
+
+    emitClose(calls[0]!.proc, 0);
+    await expect(muxPromise).resolves.toMatchObject({ success: true });
+  });
+
+  it("keeps WebM audio on the Opus transcode path", async () => {
+    const { spawn, calls } = createSpawnSpy();
+    vi.resetModules();
+    vi.doMock("child_process", () => ({ spawn }));
+
+    const { muxVideoWithAudio } = await import("./chunkEncoder.js");
+    const muxPromise = muxVideoWithAudio(
+      "/tmp/video-only.webm",
+      "/tmp/audio.aac",
+      "/tmp/output.webm",
+    );
+
+    expect(calls).toHaveLength(1);
+    expect(calls[0]!.args).toContain("-c:a");
+    expect(calls[0]!.args[calls[0]!.args.indexOf("-c:a") + 1]).toBe("libopus");
+    expect(calls[0]!.args).not.toContain("+faststart");
+
+    emitClose(calls[0]!.proc, 0);
+    await expect(muxPromise).resolves.toMatchObject({ success: true });
+  });
+});
+
 describe("getEncoderPreset", () => {
   it("returns h264 with yuv420p for mp4 format", () => {
     const preset = getEncoderPreset("standard", "mp4");
diff --git a/packages/engine/src/services/chunkEncoder.ts b/packages/engine/src/services/chunkEncoder.ts
index fb11da4680..7df4ff03fd 100644
--- a/packages/engine/src/services/chunkEncoder.ts
+++ b/packages/engine/src/services/chunkEncoder.ts
@@ -8,7 +8,7 @@
 
 import { spawn } from "child_process";
 import { copyFileSync, existsSync, mkdirSync, readdirSync, statSync, writeFileSync } from "fs";
-import { join, dirname } from "path";
+import { join, dirname, extname } from "path";
 import { trackChildProcess } from "../utils/processTracker.js";
 import { DEFAULT_CONFIG, type EngineConfig } from "../config.js";
 import {
@@ -44,6 +44,10 @@ function appendEncodeTimeoutMessage(error: string, timedOut: boolean, timeoutMs:
   return `${error}\nFFmpeg killed after exceeding ffmpegEncodeTimeout (${timeoutMs} ms)`;
 }
 
+function isAacSidecar(audioPath: string): boolean {
+  return extname(audioPath).toLowerCase() === ".aac";
+}
+
 /**
  * Get encoder preset for a given quality and output format.
  * WebM uses VP9 with alpha-capable pixel format; MP4 uses h264 (or h265 for HDR);
@@ -703,9 +707,21 @@ export async function muxVideoWithAudio(
   if (isWebm) {
     args.push("-c:a", "libopus", "-b:a", "128k");
   } else if (isMov) {
-    args.push("-c:a", "aac", "-b:a", "192k");
+    if (isAacSidecar(audioPath)) {
+      args.push("-c:a", "copy");
+    } else {
+      args.push("-c:a", "aac", "-b:a", "192k");
+    }
   } else {
-    args.push("-c:a", "aac", "-b:a", "192k", "-movflags", "+faststart");
+    // HyperFrames' audio mixer already writes an AAC sidecar. Re-encoding
+    // that AAC during MP4 mux adds a second encoder-priming interval; ffmpeg
+    // preserves the gap as an empty video edit list, which QuickTime/Safari
+    // render as a black first frame. Copy the mixed sidecar instead.
+    if (isAacSidecar(audioPath)) {
+      args.push("-c:a", "copy", "-movflags", "+faststart");
+    } else {
+      args.push("-c:a", "aac", "-b:a", "192k", "-movflags", "+faststart");
+    }
   }
   // PTS bases can diverge during mux and reintroduce negative DTS. See
   // buildEncoderArgs for the full reasoning on why that breaks playback.
diff --git a/packages/producer/src/services/distributed/assemble.test.ts b/packages/producer/src/services/distributed/assemble.test.ts
index 74599f8e4a..a12d644173 100644
--- a/packages/producer/src/services/distributed/assemble.test.ts
+++ b/packages/producer/src/services/distributed/assemble.test.ts
@@ -146,7 +146,7 @@ function probeStream(
       "-select_streams",
       streamSelector,
       "-show_entries",
-      "stream=duration,nb_frames,nb_read_packets,codec_name,r_frame_rate",
+      "stream=start_time,duration,nb_frames,nb_read_packets,codec_name,r_frame_rate",
       "-count_packets",
       "-of",
       "json",
@@ -316,6 +316,10 @@ describe("assemble()", () => {
       const audioStream = probeStream(outputPath, "a:0");
       expect(audioStream).toBeDefined();
       expect(audioStream?.codec_name).toBe("aac");
+      const videoStream = probeStream(outputPath, "v:0");
+      expect(videoStream).toBeDefined();
+      expect(Number(videoStream?.start_time ?? NaN)).toBeLessThan(0.001);
+      expect(Number(audioStream?.start_time ?? NaN)).toBeLessThan(0.001);
       // Audio duration should be within ~25ms of `totalFrames / fps` after
       // pad/trim. The 25ms tolerance absorbs AAC frame quantization (1024
       // samples @ 48kHz = ~21ms).

From 7ef576949499353e7077bcff3a3d1a24206328d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20=C3=81ngel?= <miguel.sierra@heygen.com>
Date: Sat, 20 Jun 2026 16:41:29 -0400
Subject: [PATCH 2/4] fix(producer): avoid AAC re-encode in distributed audio
 pad

---
 .../engine/src/services/chunkEncoder.test.ts  |  30 ++-
 packages/engine/src/services/chunkEncoder.ts  |  31 +++-
 .../src/services/distributed/assemble.test.ts |  42 +++++
 .../src/services/distributed/assemble.ts      |   2 +-
 .../src/services/render/audioPadTrim.test.ts  |  68 +++++--
 .../src/services/render/audioPadTrim.ts       | 173 +++++++++++++++---
 .../services/render/stages/assembleStage.ts   |   2 +-
 7 files changed, 291 insertions(+), 57 deletions(-)

diff --git a/packages/engine/src/services/chunkEncoder.test.ts b/packages/engine/src/services/chunkEncoder.test.ts
index b358fe57c7..fe6b1672f2 100644
--- a/packages/engine/src/services/chunkEncoder.test.ts
+++ b/packages/engine/src/services/chunkEncoder.test.ts
@@ -400,6 +400,34 @@ describe("muxVideoWithAudio audio codec handling", () => {
     });
   });
 
+  it("uses the caller-provided AAC codec contract instead of the sidecar extension", async () => {
+    const { spawn, calls } = createSpawnSpy();
+    vi.resetModules();
+    vi.doMock("child_process", () => ({ spawn }));
+
+    const { muxVideoWithAudio } = await import("./chunkEncoder.js");
+    const muxPromise = muxVideoWithAudio(
+      "/tmp/video-only.mp4",
+      "/tmp/audio-sidecar",
+      "/tmp/output.mp4",
+      undefined,
+      { audioCodec: "aac" },
+      { num: 30, den: 1 },
+    );
+
+    expect(calls).toHaveLength(1);
+    expect(calls[0]!.args).toContain("-c:a");
+    expect(calls[0]!.args[calls[0]!.args.indexOf("-c:a") + 1]).toBe("copy");
+    expect(calls[0]!.args).not.toContain("-b:a");
+    expect(calls[0]!.args).toContain("+faststart");
+
+    emitClose(calls[0]!.proc, 0);
+    await expect(muxPromise).resolves.toMatchObject({
+      success: true,
+      outputPath: "/tmp/output.mp4",
+    });
+  });
+
   it("still transcodes non-AAC audio when muxing MP4", async () => {
     const { spawn, calls } = createSpawnSpy();
     vi.resetModules();
@@ -422,7 +450,7 @@ describe("muxVideoWithAudio audio codec handling", () => {
     await expect(muxPromise).resolves.toMatchObject({ success: true });
   });
 
-  it("copies HyperFrames AAC sidecars into MOV containers", async () => {
+  it("copies HyperFrames AAC sidecars into MOV containers without MP4 faststart flags", async () => {
     const { spawn, calls } = createSpawnSpy();
     vi.resetModules();
     vi.doMock("child_process", () => ({ spawn }));
diff --git a/packages/engine/src/services/chunkEncoder.ts b/packages/engine/src/services/chunkEncoder.ts
index 7df4ff03fd..18ecabd50e 100644
--- a/packages/engine/src/services/chunkEncoder.ts
+++ b/packages/engine/src/services/chunkEncoder.ts
@@ -48,6 +48,21 @@ function isAacSidecar(audioPath: string): boolean {
   return extname(audioPath).toLowerCase() === ".aac";
 }
 
+export interface MuxVideoWithAudioOptions extends Partial<
+  Pick<EngineConfig, "ffmpegProcessTimeout">
+> {
+  /**
+   * Codec of the sidecar audio when the caller already knows it. HyperFrames
+   * render paths pass the mixed AAC sidecar by contract, so muxing should not
+   * depend on the file extension alone.
+   */
+  audioCodec?: "aac";
+}
+
+function shouldCopyAacSidecar(audioPath: string, options: MuxVideoWithAudioOptions | undefined) {
+  return options?.audioCodec === "aac" || isAacSidecar(audioPath);
+}
+
 /**
  * Get encoder preset for a given quality and output format.
  * WebM uses VP9 with alpha-capable pixel format; MP4 uses h264 (or h265 for HDR);
@@ -694,7 +709,7 @@ export async function muxVideoWithAudio(
   audioPath: string,
   outputPath: string,
   signal?: AbortSignal,
-  config?: Partial<Pick<EngineConfig, "ffmpegProcessTimeout">>,
+  config?: MuxVideoWithAudioOptions,
   fps?: Fps,
 ): Promise<MuxResult> {
   const outputDir = dirname(outputPath);
@@ -702,22 +717,24 @@ export async function muxVideoWithAudio(
 
   const isWebm = outputPath.endsWith(".webm");
   const isMov = outputPath.endsWith(".mov");
+  const shouldCopyAudio = shouldCopyAacSidecar(audioPath, config);
   const args = ["-i", videoPath, "-i", audioPath, "-c:v", "copy"];
 
   if (isWebm) {
     args.push("-c:a", "libopus", "-b:a", "128k");
   } else if (isMov) {
-    if (isAacSidecar(audioPath)) {
+    if (shouldCopyAudio) {
       args.push("-c:a", "copy");
     } else {
       args.push("-c:a", "aac", "-b:a", "192k");
     }
   } else {
-    // HyperFrames' audio mixer already writes an AAC sidecar. Re-encoding
-    // that AAC during MP4 mux adds a second encoder-priming interval; ffmpeg
-    // preserves the gap as an empty video edit list, which QuickTime/Safari
-    // render as a black first frame. Copy the mixed sidecar instead.
-    if (isAacSidecar(audioPath)) {
+    // processCompositionAudio (audioMixer.ts) performs the AAC encode and
+    // owns the single encoder-priming interval. Copying that sidecar into
+    // MP4 preserves the correct priming metadata; re-encoding it during mux
+    // creates another priming interval that ffmpeg writes as an empty leading
+    // video edit list, which QuickTime/Safari render as a black first frame.
+    if (shouldCopyAudio) {
       args.push("-c:a", "copy", "-movflags", "+faststart");
     } else {
       args.push("-c:a", "aac", "-b:a", "192k", "-movflags", "+faststart");
diff --git a/packages/producer/src/services/distributed/assemble.test.ts b/packages/producer/src/services/distributed/assemble.test.ts
index a12d644173..4cb662097a 100644
--- a/packages/producer/src/services/distributed/assemble.test.ts
+++ b/packages/producer/src/services/distributed/assemble.test.ts
@@ -330,6 +330,48 @@ describe("assemble()", () => {
     TIMEOUT_MS,
   );
 
+  it(
+    "muxes padded short audio without shifting the first video frame",
+    async () => {
+      if (!hasFfmpeg) return;
+
+      const chunks: ChunkSliceJson[] = [
+        { index: 0, startFrame: 0, endFrame: 6 },
+        { index: 1, startFrame: 6, endFrame: 12 },
+      ];
+      const totalFrames = 12;
+      const fps = 30;
+      const planDir = buildPlanDir("mp4", chunks, totalFrames, true);
+
+      const chunkAPath = join(planDir, "chunk-0.mp4");
+      const chunkBPath = join(planDir, "chunk-1.mp4");
+      const audioPath = join(planDir, "audio.aac");
+      makeMp4Chunk(chunkAPath, 6);
+      makeMp4Chunk(chunkBPath, 6);
+      // Audio is shorter than the video, forcing the distributed pad branch.
+      makeAacAudio(audioPath, totalFrames / fps - 0.2);
+
+      const outputPath = join(planDir, "output-audio-padded.mp4");
+      const result = await assemble(planDir, [chunkAPath, chunkBPath], audioPath, outputPath);
+
+      expect(existsSync(outputPath)).toBe(true);
+      expect(result.framesEncoded).toBe(totalFrames);
+
+      const audioStream = probeStream(outputPath, "a:0");
+      expect(audioStream).toBeDefined();
+      expect(audioStream?.codec_name).toBe("aac");
+      const videoStream = probeStream(outputPath, "v:0");
+      expect(videoStream).toBeDefined();
+      expect(Number(videoStream?.start_time ?? NaN)).toBeLessThan(0.001);
+      expect(Number(audioStream?.start_time ?? NaN)).toBeLessThan(0.001);
+
+      const audioDuration = Number(audioStream?.duration ?? 0);
+      const expected = totalFrames / fps;
+      expect(Math.abs(audioDuration - expected)).toBeLessThan(0.05);
+    },
+    TIMEOUT_MS,
+  );
+
   it(
     "cfr:true re-encodes for exact avg_frame_rate matching r_frame_rate",
     async () => {
diff --git a/packages/producer/src/services/distributed/assemble.ts b/packages/producer/src/services/distributed/assemble.ts
index ec6eb98f1d..747240f034 100644
--- a/packages/producer/src/services/distributed/assemble.ts
+++ b/packages/producer/src/services/distributed/assemble.ts
@@ -320,7 +320,7 @@ export async function assemble(
         audioForMux,
         muxOutputPath,
         abortSignal,
-        undefined,
+        { audioCodec: "aac" },
         { num: plan.dimensions.fpsNum, den: plan.dimensions.fpsDen },
       );
       if (!muxResult.success) {
diff --git a/packages/producer/src/services/render/audioPadTrim.test.ts b/packages/producer/src/services/render/audioPadTrim.test.ts
index 3a73fd1d51..821537a65c 100644
--- a/packages/producer/src/services/render/audioPadTrim.test.ts
+++ b/packages/producer/src/services/render/audioPadTrim.test.ts
@@ -16,6 +16,7 @@
 import { describe, expect, it } from "bun:test";
 import {
   buildPadTrimAudioArgs,
+  buildPadTrimAudioPlan,
   padOrTrimAudioToVideoFrameCount,
   type AudioProbeInfo,
   type PadTrimAudioInput,
@@ -23,18 +24,47 @@ import {
 } from "./audioPadTrim.js";
 
 describe("buildPadTrimAudioArgs", () => {
-  it("emits an apad filter when audio is shorter than target", () => {
+  it("emits a concat-copy pad plan when audio is shorter than target", () => {
+    const plan = buildPadTrimAudioPlan("/tmp/in.aac", "/tmp/out.aac", 4.0, 5.0, {
+      sampleRate: 48000,
+      channels: 2,
+    });
+    expect(plan.operation).toBe("pad");
+    expect(plan.steps).toHaveLength(2);
+
+    const silenceArgs = plan.steps[0]!.args;
+    expect(plan.steps[0]!.kind).toBe("pad-silence");
+    expect(silenceArgs).not.toContain("/tmp/in.aac");
+    expect(silenceArgs[silenceArgs.indexOf("-i") + 1]).toBe(
+      "anullsrc=channel_layout=stereo:sample_rate=48000",
+    );
+    expect(silenceArgs[silenceArgs.indexOf("-t") + 1]).toBe("1.000000");
+    expect(silenceArgs[silenceArgs.indexOf("-c:a") + 1]).toBe("aac");
+
+    const concatArgs = plan.steps[1]!.args;
+    expect(plan.steps[1]!.kind).toBe("pad-concat");
+    expect(concatArgs).toContain("concat");
+    expect(concatArgs[concatArgs.indexOf("-c:a") + 1]).toBe("copy");
+    expect(concatArgs[concatArgs.length - 1]).toBe("/tmp/out.aac");
+    expect(plan.concatList?.contents).toContain("file '/tmp/in.aac'");
+    expect(plan.concatList?.contents).toContain("file '/tmp/out.aac.pad-silence.aac'");
+    expect(plan.cleanupPaths).toEqual([
+      "/tmp/out.aac.pad-silence.aac",
+      "/tmp/out.aac.pad-concat.txt",
+    ]);
+
+    const reencodedSourceStep = plan.steps.find(
+      (step) =>
+        step.args.includes("/tmp/in.aac") && step.args[step.args.indexOf("-c:a") + 1] === "aac",
+    );
+    expect(reencodedSourceStep).toBeUndefined();
+  });
+
+  it("keeps the legacy args helper on the first pad materialization step", () => {
     const { args, operation } = buildPadTrimAudioArgs("/tmp/in.aac", "/tmp/out.aac", 4.0, 5.0);
     expect(operation).toBe("pad");
-    const afIdx = args.indexOf("-af");
-    expect(afIdx).toBeGreaterThan(-1);
-    expect(args[afIdx + 1]).toContain("apad=pad_dur=");
-    expect(args[afIdx + 1]).toMatch(/pad_dur=1\.0+/);
-    // Pad must re-encode — apad is a filter and filters can't combine with copy.
-    const codecIdx = args.indexOf("-c:a");
-    expect(args[codecIdx + 1]).toBe("aac");
-    expect(args[args.length - 1]).toBe("/tmp/out.aac");
-    expect(args.includes("-y")).toBe(true);
+    expect(args).not.toContain("/tmp/in.aac");
+    expect(args[args.indexOf("-t") + 1]).toBe("1.000000");
   });
 
   it("emits -t when audio is longer than target", () => {
@@ -57,14 +87,14 @@ describe("buildPadTrimAudioArgs", () => {
     expect(args[codecIdx + 1]).toBe("copy");
   });
 
-  it("emits 6-decimal-place pad_dur (no scientific notation)", () => {
+  it("emits 6-decimal-place pad duration (no scientific notation)", () => {
     // 1.23ms — just over the AUDIO_DURATION_TOLERANCE_SECONDS=1ms threshold,
     // so we exercise the pad path with a tiny duration that would round to
     // exponent notation if we used `toString()` instead of `toFixed(6)`.
     const { args, operation } = buildPadTrimAudioArgs("/tmp/in.aac", "/tmp/out.aac", 0.0, 0.00123);
     expect(operation).toBe("pad");
-    const afIdx = args.indexOf("-af");
-    expect(args[afIdx + 1]).toBe("apad=pad_dur=0.001230");
+    const tIdx = args.indexOf("-t");
+    expect(args[tIdx + 1]).toBe("0.001230");
   });
 
   it("flags ~1ms drift as a copy (below the tolerance threshold)", () => {
@@ -120,9 +150,11 @@ describe("padOrTrimAudioToVideoFrameCount", () => {
     expect(result.operation).toBe("pad");
     expect(result.targetDurationSeconds).toBe(6);
     expect(result.sourceDurationSeconds).toBe(5.5);
-    expect(captured.args).toHaveLength(1);
-    const afIdx = captured.args[0]!.indexOf("-af");
-    expect(captured.args[0]![afIdx + 1]).toBe("apad=pad_dur=0.500000");
+    expect(captured.args).toHaveLength(2);
+    const tIdx = captured.args[0]!.indexOf("-t");
+    expect(captured.args[0]![tIdx + 1]).toBe("0.500000");
+    expect(captured.args[0]).not.toContain("/tmp/a.aac");
+    expect(captured.args[1]![captured.args[1]!.indexOf("-c:a") + 1]).toBe("copy");
   });
 
   it("trims a video of N=120 frames at 30/1 fps with longer audio", async () => {
@@ -162,8 +194,8 @@ describe("padOrTrimAudioToVideoFrameCount", () => {
     expect(result.success).toBe(true);
     expect(result.operation).toBe("pad");
     expect(result.targetDurationSeconds).toBeCloseTo((120 * 1001) / 30000, 9);
-    const afIdx = captured.args[0]!.indexOf("-af");
-    expect(captured.args[0]![afIdx + 1]).toMatch(/^apad=pad_dur=0\.004\d+$/);
+    const tIdx = captured.args[0]!.indexOf("-t");
+    expect(captured.args[0]![tIdx + 1]).toMatch(/^0\.004\d+$/);
   });
 
   it("propagates video probe failure as success=false", async () => {
diff --git a/packages/producer/src/services/render/audioPadTrim.ts b/packages/producer/src/services/render/audioPadTrim.ts
index 9f124f5060..e94008ee04 100644
--- a/packages/producer/src/services/render/audioPadTrim.ts
+++ b/packages/producer/src/services/render/audioPadTrim.ts
@@ -14,10 +14,12 @@
  *     "audio cuts off early" or "video shows a frozen final frame" bugs.
  *
  * The fix: post-pad/trim audio to *exactly* `frameCount / fps` seconds at
- * assemble time. Pad with `apad=pad_dur=…` (silence fill), trim with `-t`.
+ * assemble time. Pad by concat-copying a generated silence tail, trim with
+ * `-t`, and avoid re-encoding the already mixed source AAC in either case.
  */
 
 import { spawn } from "node:child_process";
+import { rmSync, writeFileSync } from "node:fs";
 import {
   extractAudioMetadata,
   formatFfmpegError,
@@ -45,6 +47,12 @@ export interface ProbeVideoFrameInfo {
 export interface AudioProbeInfo {
   /** Decoded duration in seconds. */
   durationSeconds: number;
+  /** Audio sample rate in Hz. Used when generating pad silence. */
+  sampleRate?: number;
+  /** Audio channel count. Used when generating pad silence. */
+  channels?: number;
+  /** Codec name reported by ffprobe. */
+  audioCodec?: string;
 }
 
 export interface PadTrimAudioInput {
@@ -78,49 +86,95 @@ export interface PadTrimAudioResult {
   error?: string;
 }
 
+export type PadTrimAudioStepKind = "copy" | "trim" | "pad-silence" | "pad-concat"; // pad is split in two
+
+export interface PadTrimAudioStep { // one ffmpeg invocation within a plan
+  kind: PadTrimAudioStepKind; // which part of the plan this invocation performs
+  args: string[]; // argv passed to the ffmpeg runner for this step
+}
+
+export interface PadTrimAudioPlan { // everything needed to materialize one copy/trim/pad decision
+  operation: PadTrimOperation; // the decision the steps implement
+  steps: PadTrimAudioStep[]; // run in order; the executor returns at the first failing step
+  concatList?: { path: string; contents: string }; // pad only: list file to write before the steps
+  cleanupPaths: string[]; // temp files the executor removes afterwards, success or failure
+}
+
 /**
- * Pure helper: decide the pad/trim operation and build the ffmpeg argv list
- * that materializes it. Exported separately so unit tests can pin both
- * branches without spawning ffmpeg.
+ * Pure helper: decide the pad/trim operation and build the ffmpeg argv
+ * sequence that materializes it. Exported separately so unit tests can pin
+ * every branch without spawning ffmpeg.
  *
- *   - `sourceDuration < targetDuration` → pad with `apad=pad_dur=Δ`.
- *     Re-encode is required: `apad` is a filter and filters can't combine
- *     with `-c:a copy`.
+ *   - `sourceDuration < targetDuration` → encode only the missing silence
+ *     tail, then concat-copy the source AAC followed by that tail. The
+ *     already mixed `audio.aac` is never re-encoded, so padding, like
+ *     trimming, stays a stream copy of the source, not a second AAC encode.
  *   - `sourceDuration > targetDuration` → trim with `-t target`. `-c:a copy`
  *     is preserved when the input is already AAC.
  *   - `|Δ| < AUDIO_DURATION_TOLERANCE_SECONDS` → no-op `copy`, but we still
  *     run ffmpeg with `-c:a copy` to materialize the output path.
  */
-export function buildPadTrimAudioArgs(
+export function buildPadTrimAudioPlan(
   audioPath: string,
   outputPath: string,
   sourceDurationSeconds: number,
   targetDurationSeconds: number,
-): { args: string[]; operation: PadTrimOperation } {
+  audioInfo: Pick<AudioProbeInfo, "sampleRate" | "channels"> = {},
+): PadTrimAudioPlan {
   const delta = targetDurationSeconds - sourceDurationSeconds;
   const targetSec = formatSeconds(targetDurationSeconds);
   if (Math.abs(delta) < AUDIO_DURATION_TOLERANCE_SECONDS) {
     return {
       operation: "copy",
-      args: ["-i", audioPath, "-c:a", "copy", "-y", outputPath],
+      steps: [{ kind: "copy", args: ["-i", audioPath, "-c:a", "copy", "-y", outputPath] }],
+      cleanupPaths: [],
     };
   }
   if (delta > 0) {
     const padDur = formatSeconds(delta);
+    const silencePath = `${outputPath}.pad-silence.aac`;
+    const concatListPath = `${outputPath}.pad-concat.txt`;
     return {
       operation: "pad",
-      args: [
-        "-i",
-        audioPath,
-        "-af",
-        `apad=pad_dur=${padDur}`,
-        "-c:a",
-        "aac",
-        "-b:a",
-        "192k",
-        "-y",
-        outputPath,
+      steps: [
+        {
+          kind: "pad-silence",
+          args: [
+            "-f",
+            "lavfi",
+            "-i",
+            `anullsrc=channel_layout=${channelLayoutForChannels(audioInfo.channels)}:sample_rate=${sampleRateForFilter(audioInfo.sampleRate)}`,
+            "-t",
+            padDur,
+            "-c:a",
+            "aac",
+            "-b:a",
+            "192k",
+            "-y",
+            silencePath,
+          ],
+        },
+        {
+          kind: "pad-concat",
+          args: [
+            "-f",
+            "concat",
+            "-safe",
+            "0",
+            "-i",
+            concatListPath,
+            "-c:a",
+            "copy",
+            "-y",
+            outputPath,
+          ],
+        },
       ],
+      concatList: {
+        path: concatListPath,
+        contents: `${concatFileLine(audioPath)}\n${concatFileLine(silencePath)}\n`,
+      },
+      cleanupPaths: [silencePath, concatListPath],
     };
   }
   // Trim. `-t` truncates AAC without re-encoding because AAC frames are
@@ -128,10 +182,28 @@ export function buildPadTrimAudioArgs(
   // packet boundary, fine for the ±1ms tolerance we care about here.
   return {
     operation: "trim",
-    args: ["-i", audioPath, "-t", targetSec, "-c:a", "copy", "-y", outputPath],
+    steps: [
+      { kind: "trim", args: ["-i", audioPath, "-t", targetSec, "-c:a", "copy", "-y", outputPath] },
+    ],
+    cleanupPaths: [],
   };
 }
 
+export function buildPadTrimAudioArgs( // single-argv view over buildPadTrimAudioPlan
+  audioPath: string,
+  outputPath: string,
+  sourceDurationSeconds: number,
+  targetDurationSeconds: number,
+): { args: string[]; operation: PadTrimOperation } {
+  const plan = buildPadTrimAudioPlan( // audioInfo is omitted, so the plan falls back to its default {}
+    audioPath,
+    outputPath,
+    sourceDurationSeconds,
+    targetDurationSeconds,
+  );
+  return { operation: plan.operation, args: plan.steps[0]?.args ?? [] }; // steps[0] only: for pad, the silence step
+}
+
 /**
  * Format a duration as a fixed-precision decimal string. ffmpeg parses
  * scientific notation inconsistently across versions (some treat `1e-3` as
@@ -142,6 +214,24 @@ function formatSeconds(sec: number): string {
   return sec.toFixed(6);
 }
 
+function sampleRateForFilter(sampleRate: number | undefined): number { // value for anullsrc sample_rate=
+  return sampleRate !== undefined && Number.isFinite(sampleRate) && sampleRate > 0
+    ? Math.round(sampleRate) // finite positive input, rounded to a whole number
+    : 48000; // fallback for undefined, NaN, infinite, zero, or negative input
+}
+
+function channelLayoutForChannels(channels: number | undefined): string { // value for anullsrc channel_layout=
+  if (channels === 1) return "mono";
+  if (channels === 6) return "5.1";
+  if (channels === 8) return "7.1";
+  return "stereo"; // undefined and every other channel count (2, 3, 4, 5, 7, ...) map to stereo
+}
+
+function concatFileLine(path: string): string { // builds one file entry for the concat list
+  const normalized = path.replace(/\\/g, "/"); // every backslash becomes a forward slash
+  return `file '${normalized.replace(/'/g, "'\\''")}'`; // NOTE(review): relative paths likely resolve against the list file dir; confirm
+}
+
 /**
  * Pad or trim `audio.aac` so its exact duration matches `frameCount / fps`
  * for the assembled video.
@@ -197,30 +287,50 @@ export async function padOrTrimAudioToVideoFrameCount(
   }
 
   const targetDurationSeconds = (videoInfo.frameCount * videoInfo.fpsDen) / videoInfo.fpsNum;
-  const { args, operation } = buildPadTrimAudioArgs(
+  const plan = buildPadTrimAudioPlan(
     input.audioPath,
     input.outputPath,
     audioInfo.durationSeconds,
     targetDurationSeconds,
+    audioInfo,
   );
 
-  const ffmpegResult = await runner(args);
-  if (!ffmpegResult.success) {
+  try {
+    if (plan.concatList) writeFileSync(plan.concatList.path, plan.concatList.contents, "utf-8");
+
+    for (const step of plan.steps) {
+      const ffmpegResult = await runner(step.args);
+      if (!ffmpegResult.success) {
+        return {
+          success: false,
+          outputPath: input.outputPath,
+          targetDurationSeconds,
+          sourceDurationSeconds: audioInfo.durationSeconds,
+          operation: plan.operation,
+          error: ffmpegResult.error,
+        };
+      }
+    }
+  } catch (err) {
     return {
       success: false,
       outputPath: input.outputPath,
       targetDurationSeconds,
       sourceDurationSeconds: audioInfo.durationSeconds,
-      operation,
-      error: ffmpegResult.error,
+      operation: plan.operation,
+      error: `audioPadTrim: failed to materialize ${plan.operation}: ${
+        err instanceof Error ? err.message : String(err)
+      }`,
     };
+  } finally {
+    for (const path of plan.cleanupPaths) rmSync(path, { force: true });
   }
   return {
     success: true,
     outputPath: input.outputPath,
     targetDurationSeconds,
     sourceDurationSeconds: audioInfo.durationSeconds,
-    operation,
+    operation: plan.operation,
   };
 }
 
@@ -307,7 +417,12 @@ function parseFrameRate(rate: string): { fpsNum: number; fpsDen: number } {
 async function defaultProbeAudioInfo(audioPath: string): Promise<AudioProbeInfo> {
   // extractAudioMetadata is the shared ffprobe wrapper (caches results).
   const metadata: AudioMetadata = await extractAudioMetadata(audioPath);
-  return { durationSeconds: metadata.durationSeconds };
+  return {
+    durationSeconds: metadata.durationSeconds,
+    sampleRate: metadata.sampleRate,
+    channels: metadata.channels,
+    audioCodec: metadata.audioCodec,
+  };
 }
 
 async function defaultRunFfmpeg(args: string[]): Promise<{ success: boolean; error?: string }> {
diff --git a/packages/producer/src/services/render/stages/assembleStage.ts b/packages/producer/src/services/render/stages/assembleStage.ts
index 83ae671f45..c3e4ebc3e7 100644
--- a/packages/producer/src/services/render/stages/assembleStage.ts
+++ b/packages/producer/src/services/render/stages/assembleStage.ts
@@ -60,7 +60,7 @@ export async function runAssembleStage(input: AssembleStageInput): Promise<Assem
       audioOutputPath,
       outputPath,
       abortSignal,
-      undefined,
+      { audioCodec: "aac" },
       job.config.fps,
     );
     assertNotAborted();

From f87b2c41737d38210ab31a0c59b8527cb94247a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20=C3=81ngel?= <miguel.sierra@heygen.com>
Date: Sat, 20 Jun 2026 16:49:06 -0400
Subject: [PATCH 3/4] fix(engine): probe AAC sidecars before mux copy decision

---
 .../engine/src/services/chunkEncoder.test.ts  | 73 +++++++++++++++++++
 packages/engine/src/services/chunkEncoder.ts  | 32 +++++++-
 2 files changed, 102 insertions(+), 3 deletions(-)

diff --git a/packages/engine/src/services/chunkEncoder.test.ts b/packages/engine/src/services/chunkEncoder.test.ts
index fe6b1672f2..d7cd0a8b26 100644
--- a/packages/engine/src/services/chunkEncoder.test.ts
+++ b/packages/engine/src/services/chunkEncoder.test.ts
@@ -18,6 +18,7 @@ afterEach(() => {
   }
   vi.resetModules();
   vi.doUnmock("child_process");
+  vi.doUnmock("../utils/ffprobe.js");
   vi.useRealTimers();
 });
 
@@ -88,6 +89,11 @@ function emitClose(proc: FakeProc, code: number): void {
   proc.emit("close", code);
 }
 
+async function flushMuxCodecResolution(): Promise<void> {
+  await Promise.resolve(); // yield one microtask turn so pending async work in the code under test can advance
+  await Promise.resolve(); // second turn; NOTE(review): presumably exactly enough for muxVideoWithAudio's awaited codec decision to reach spawn — revisit if more awaits are added
+}
+
 describe("ENCODER_PRESETS", () => {
   it("has draft, standard, and high presets", () => {
     expect(ENCODER_PRESETS).toHaveProperty("draft");
@@ -371,6 +377,7 @@ describe("muxVideoWithAudio audio codec handling", () => {
       { num: 30, den: 1 },
     );
 
+    await flushMuxCodecResolution();
     expect(calls).toHaveLength(1);
     expect(calls[0]!.args).toEqual([
       "-i",
@@ -415,6 +422,7 @@ describe("muxVideoWithAudio audio codec handling", () => {
       { num: 30, den: 1 },
     );
 
+    await flushMuxCodecResolution();
     expect(calls).toHaveLength(1);
     expect(calls[0]!.args).toContain("-c:a");
     expect(calls[0]!.args[calls[0]!.args.indexOf("-c:a") + 1]).toBe("copy");
@@ -428,6 +436,69 @@ describe("muxVideoWithAudio audio codec handling", () => {
     });
   });
 
+  it("probes unknown-extension AAC sidecars before choosing the MP4 copy path", async () => {
+    const { spawn, calls } = createSpawnSpy();
+    const extractAudioMetadata = vi.fn(async () => ({
+      durationSeconds: 1,
+      sampleRate: 48000,
+      channels: 2,
+      audioCodec: "aac",
+    })); // stubbed probe result reporting an AAC stream
+    vi.resetModules(); // mocks are registered before the dynamic import below so the fresh module instance sees them
+    vi.doMock("child_process", () => ({ spawn }));
+    vi.doMock("../utils/ffprobe.js", () => ({ extractAudioMetadata }));
+
+    const { muxVideoWithAudio } = await import("./chunkEncoder.js");
+    const muxPromise = muxVideoWithAudio(
+      "/tmp/video-only.mp4",
+      "/tmp/audio-sidecar", // no file extension and no options argument, so only the probe can identify the codec
+      "/tmp/output.mp4",
+    );
+
+    await flushMuxCodecResolution(); // let the async codec decision finish before inspecting spawn calls
+    expect(extractAudioMetadata).toHaveBeenCalledWith("/tmp/audio-sidecar");
+    expect(calls).toHaveLength(1);
+    expect(calls[0]!.args).toContain("-c:a");
+    expect(calls[0]!.args[calls[0]!.args.indexOf("-c:a") + 1]).toBe("copy"); // the value following -c:a must be "copy"
+    expect(calls[0]!.args).not.toContain("-b:a"); // the copy path must not pass an audio bitrate flag
+
+    emitClose(calls[0]!.proc, 0); // simulate the spawned process closing with exit code 0
+    await expect(muxPromise).resolves.toMatchObject({
+      success: true,
+      outputPath: "/tmp/output.mp4",
+    });
+  });
+
+  it("keeps probed non-AAC unknown-extension sidecars on the MP4 transcode path", async () => {
+    const { spawn, calls } = createSpawnSpy();
+    const extractAudioMetadata = vi.fn(async () => ({
+      durationSeconds: 1,
+      sampleRate: 48000,
+      channels: 2,
+      audioCodec: "mp3",
+    })); // stubbed probe result reporting a non-AAC (mp3) stream
+    vi.resetModules(); // mocks are registered before the dynamic import below so the fresh module instance sees them
+    vi.doMock("child_process", () => ({ spawn }));
+    vi.doMock("../utils/ffprobe.js", () => ({ extractAudioMetadata }));
+
+    const { muxVideoWithAudio } = await import("./chunkEncoder.js");
+    const muxPromise = muxVideoWithAudio(
+      "/tmp/video-only.mp4",
+      "/tmp/audio-sidecar", // no file extension and no options argument, so only the probe can identify the codec
+      "/tmp/output.mp4",
+    );
+
+    await flushMuxCodecResolution(); // let the async codec decision finish before inspecting spawn calls
+    expect(extractAudioMetadata).toHaveBeenCalledWith("/tmp/audio-sidecar");
+    expect(calls).toHaveLength(1);
+    expect(calls[0]!.args).toContain("-c:a");
+    expect(calls[0]!.args[calls[0]!.args.indexOf("-c:a") + 1]).toBe("aac"); // the value following -c:a must be the "aac" encoder, i.e. a transcode
+    expect(calls[0]!.args).toContain("-b:a"); // the transcode path is expected to pass an audio bitrate flag
+
+    emitClose(calls[0]!.proc, 0); // simulate the spawned process closing with exit code 0
+    await expect(muxPromise).resolves.toMatchObject({ success: true });
+  });
+
   it("still transcodes non-AAC audio when muxing MP4", async () => {
     const { spawn, calls } = createSpawnSpy();
     vi.resetModules();
@@ -440,6 +511,7 @@ describe("muxVideoWithAudio audio codec handling", () => {
       "/tmp/output.mp4",
     );
 
+    await flushMuxCodecResolution();
     expect(calls).toHaveLength(1);
     expect(calls[0]!.args).toContain("-c:a");
     expect(calls[0]!.args[calls[0]!.args.indexOf("-c:a") + 1]).toBe("aac");
@@ -462,6 +534,7 @@ describe("muxVideoWithAudio audio codec handling", () => {
       "/tmp/output.mov",
     );
 
+    await flushMuxCodecResolution();
     expect(calls).toHaveLength(1);
     expect(calls[0]!.args).toContain("-c:a");
     expect(calls[0]!.args[calls[0]!.args.indexOf("-c:a") + 1]).toBe("copy");
diff --git a/packages/engine/src/services/chunkEncoder.ts b/packages/engine/src/services/chunkEncoder.ts
index 18ecabd50e..2e21a97fd6 100644
--- a/packages/engine/src/services/chunkEncoder.ts
+++ b/packages/engine/src/services/chunkEncoder.ts
@@ -20,6 +20,7 @@ import {
 import { type HdrTransfer, getHdrEncoderColorParams } from "../utils/hdr.js";
 import { formatFfmpegError, runFfmpeg } from "../utils/runFfmpeg.js";
 import { getFfmpegBinary } from "../utils/ffmpegBinaries.js";
+import { extractAudioMetadata } from "../utils/ffprobe.js";
 import { type Fps, fpsToFfmpegArg } from "@hyperframes/core";
 import type { EncoderOptions, EncodeResult, MuxResult } from "./chunkEncoder.types.js";
 
@@ -48,6 +49,16 @@ function isAacSidecar(audioPath: string): boolean {
   return extname(audioPath).toLowerCase() === ".aac";
 }
 
+const KNOWN_NON_AAC_AUDIO_EXTENSIONS = new Set([
+  ".flac",
+  ".mp3",
+  ".oga",
+  ".ogg",
+  ".opus",
+  ".wav",
+  ".webm",
+]); // lowercase, dot-prefixed extensions that shouldCopyAacSidecar treats as non-AAC without probing the file
+
 export interface MuxVideoWithAudioOptions extends Partial<
   Pick<EngineConfig, "ffmpegProcessTimeout">
 > {
@@ -59,8 +70,23 @@ export interface MuxVideoWithAudioOptions extends Partial<
   audioCodec?: "aac";
 }
 
-function shouldCopyAacSidecar(audioPath: string, options: MuxVideoWithAudioOptions | undefined) {
-  return options?.audioCodec === "aac" || isAacSidecar(audioPath);
+async function shouldCopyAacSidecar(
+  audioPath: string,
+  options: MuxVideoWithAudioOptions | undefined,
+) {
+  // Answer true without probing when the caller declares AAC or the path ends in ".aac".
+  const declaredAac = options?.audioCodec === "aac";
+  if (declaredAac || isAacSidecar(audioPath)) return true;
+  // Extensions listed as non-AAC are rejected without probing.
+  if (KNOWN_NON_AAC_AUDIO_EXTENSIONS.has(extname(audioPath).toLowerCase())) return false;
+
+  try {
+    const { audioCodec } = await extractAudioMetadata(audioPath);
+    return audioCodec === "aac";
+  } catch {
+    // Unprobeable sidecar: answer false so the ffmpeg transcode path surfaces the mux error.
+    return false;
+  }
+}
 
 /**
@@ -717,7 +743,7 @@ export async function muxVideoWithAudio(
 
   const isWebm = outputPath.endsWith(".webm");
   const isMov = outputPath.endsWith(".mov");
-  const shouldCopyAudio = shouldCopyAacSidecar(audioPath, config);
+  const shouldCopyAudio = isWebm ? false : await shouldCopyAacSidecar(audioPath, config);
   const args = ["-i", videoPath, "-i", audioPath, "-c:v", "copy"];
 
   if (isWebm) {

From 2f2bc2ed343a354b5cea242ecd267127025611bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20=C3=81ngel?= <miguel.sierra@heygen.com>
Date: Sat, 20 Jun 2026 16:57:45 -0400
Subject: [PATCH 4/4] fix(producer): avoid temp concat file for audio padding

---
 .../src/services/render/audioPadTrim.test.ts  | 10 ++-
 .../src/services/render/audioPadTrim.ts       | 70 +++++++++++++++----
 2 files changed, 59 insertions(+), 21 deletions(-)

diff --git a/packages/producer/src/services/render/audioPadTrim.test.ts b/packages/producer/src/services/render/audioPadTrim.test.ts
index 821537a65c..cbf62e9623 100644
--- a/packages/producer/src/services/render/audioPadTrim.test.ts
+++ b/packages/producer/src/services/render/audioPadTrim.test.ts
@@ -44,14 +44,12 @@ describe("buildPadTrimAudioArgs", () => {
     const concatArgs = plan.steps[1]!.args;
     expect(plan.steps[1]!.kind).toBe("pad-concat");
     expect(concatArgs).toContain("concat");
+    expect(concatArgs[concatArgs.indexOf("-i") + 1]).toBe("pipe:0");
     expect(concatArgs[concatArgs.indexOf("-c:a") + 1]).toBe("copy");
     expect(concatArgs[concatArgs.length - 1]).toBe("/tmp/out.aac");
-    expect(plan.concatList?.contents).toContain("file '/tmp/in.aac'");
-    expect(plan.concatList?.contents).toContain("file '/tmp/out.aac.pad-silence.aac'");
-    expect(plan.cleanupPaths).toEqual([
-      "/tmp/out.aac.pad-silence.aac",
-      "/tmp/out.aac.pad-concat.txt",
-    ]);
+    expect(plan.steps[1]!.stdin).toContain("file 'file:///tmp/in.aac'");
+    expect(plan.steps[1]!.stdin).toContain("file 'file:///tmp/out.aac.pad-silence.aac'");
+    expect(plan.cleanupPaths).toEqual(["/tmp/out.aac.pad-silence.aac"]);
 
     const reencodedSourceStep = plan.steps.find(
       (step) =>
diff --git a/packages/producer/src/services/render/audioPadTrim.ts b/packages/producer/src/services/render/audioPadTrim.ts
index e94008ee04..b089b09a81 100644
--- a/packages/producer/src/services/render/audioPadTrim.ts
+++ b/packages/producer/src/services/render/audioPadTrim.ts
@@ -19,10 +19,12 @@
  */
 
 import { spawn } from "node:child_process";
-import { rmSync, writeFileSync } from "node:fs";
+import { rmSync } from "node:fs";
+import { pathToFileURL } from "node:url";
 import {
   extractAudioMetadata,
   formatFfmpegError,
+  getFfmpegBinary,
   getFfprobeBinary,
   runFfmpeg,
   type AudioMetadata,
@@ -68,7 +70,10 @@ export interface PadTrimAudioInput {
    */
   probeVideoFrameInfo?: (videoPath: string) => Promise<ProbeVideoFrameInfo>;
   probeAudioInfo?: (audioPath: string) => Promise<AudioProbeInfo>;
-  runFfmpeg?: (args: string[]) => Promise<{ success: boolean; error?: string }>;
+  runFfmpeg?: (
+    args: string[],
+    options?: { stdin?: string },
+  ) => Promise<{ success: boolean; error?: string }>;
 }
 
 export type PadTrimOperation = "pad" | "trim" | "copy";
@@ -91,12 +96,12 @@ export type PadTrimAudioStepKind = "copy" | "trim" | "pad-silence" | "pad-concat
 export interface PadTrimAudioStep {
   kind: PadTrimAudioStepKind;
   args: string[];
+  stdin?: string; // optional text handed to the ffmpeg runner as this step's standard input
 }
 
 export interface PadTrimAudioPlan {
   operation: PadTrimOperation;
   steps: PadTrimAudioStep[];
-  concatList?: { path: string; contents: string };
   cleanupPaths: string[];
 }
 
@@ -133,7 +138,6 @@ export function buildPadTrimAudioPlan(
   if (delta > 0) {
     const padDur = formatSeconds(delta);
     const silencePath = `${outputPath}.pad-silence.aac`;
-    const concatListPath = `${outputPath}.pad-concat.txt`;
     return {
       operation: "pad",
       steps: [
@@ -161,20 +165,19 @@ export function buildPadTrimAudioPlan(
             "concat",
             "-safe",
             "0",
+            "-protocol_whitelist",
+            "file,pipe,crypto,data",
             "-i",
-            concatListPath,
+            "pipe:0",
             "-c:a",
             "copy",
             "-y",
             outputPath,
           ],
+          stdin: `${concatFileLine(audioPath)}\n${concatFileLine(silencePath)}\n`,
         },
       ],
-      concatList: {
-        path: concatListPath,
-        contents: `${concatFileLine(audioPath)}\n${concatFileLine(silencePath)}\n`,
-      },
-      cleanupPaths: [silencePath, concatListPath],
+      cleanupPaths: [silencePath],
     };
   }
   // Trim. `-t` truncates AAC without re-encoding because AAC frames are
@@ -228,7 +231,7 @@ function channelLayoutForChannels(channels: number | undefined): string {
 }
 
 function concatFileLine(path: string): string {
-  const normalized = path.replace(/\\/g, "/");
+  const normalized = decodeURIComponent(pathToFileURL(path).href); // keep the file: URL form but undo percent-encoding: ffmpeg's file protocol opens the remainder verbatim, so "%20" would be looked up literally
   return `file '${normalized.replace(/'/g, "'\\''")}'`;
 }
 
@@ -296,10 +299,8 @@ export async function padOrTrimAudioToVideoFrameCount(
   );
 
   try {
-    if (plan.concatList) writeFileSync(plan.concatList.path, plan.concatList.contents, "utf-8");
-
     for (const step of plan.steps) {
-      const ffmpegResult = await runner(step.args);
+      const ffmpegResult = await runner(step.args, { stdin: step.stdin });
       if (!ffmpegResult.success) {
         return {
           success: false,
@@ -425,7 +426,12 @@ async function defaultProbeAudioInfo(audioPath: string): Promise<AudioProbeInfo>
   };
 }
 
-async function defaultRunFfmpeg(args: string[]): Promise<{ success: boolean; error?: string }> {
+async function defaultRunFfmpeg(
+  args: string[],
+  options?: { stdin?: string },
+): Promise<{ success: boolean; error?: string }> {
+  if (options?.stdin !== undefined) return runFfmpegWithStdin(args, options.stdin);
+
   const result = await runFfmpeg(args);
   if (result.success) return { success: true };
   return {
@@ -434,6 +440,40 @@ async function defaultRunFfmpeg(args: string[]): Promise<{ success: boolean; err
   };
 }
 
+/**
+ * Runs ffmpeg with `args`, writing `stdin` to its standard input and then closing it.
+ * Called by defaultRunFfmpeg when a step supplies stdin content.
+ * Never rejects: resolves `{ success: true }` on exit code 0, otherwise
+ * `{ success: false, error }` with an "[audioPadTrim]"-prefixed message.
+ */
+async function runFfmpegWithStdin(
+  args: string[],
+  stdin: string,
+): Promise<{ success: boolean; error?: string }> {
+  return new Promise((resolve) => {
+    const proc = spawn(getFfmpegBinary(), args);
+    let stderr = "";
+    const fail = (error: string) => resolve({ success: false, error: `[audioPadTrim] ${error}` });
+
+    proc.stderr.on("data", (data: Buffer) => {
+      stderr += data.toString();
+    });
+
+    // Spawn failures land here; any later "close" resolve is a no-op on a settled promise.
+    proc.on("error", (err) => fail(err instanceof Error ? err.message : String(err)));
+
+    proc.on("close", (code) => {
+      if (code === 0) resolve({ success: true });
+      else fail(formatFfmpegError(code, stderr));
+    });
+
+    // ffmpeg can exit before draining stdin; without a listener the resulting EPIPE is an
+    // unhandled "error" event that crashes the process. "error"/"close" report the failure.
+    proc.stdin.on("error", () => undefined);
+    proc.stdin.end(stdin);
+  });
+}
+
 // ── ffprobe JSON runner (shared between fast/slow video probe paths) ─────
 
 function runFfprobeJson<T>(args: string[]): Promise<T> {