From ba1afa13722f8e38310abb68ee0246fda96b839a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81ngel?= Date: Sat, 20 Jun 2026 16:21:47 -0400 Subject: [PATCH 1/4] fix(engine): copy mixed AAC during MP4 mux --- .../engine/src/services/chunkEncoder.test.ts | 111 ++++++++++++++++++ packages/engine/src/services/chunkEncoder.ts | 22 +++- .../src/services/distributed/assemble.test.ts | 6 +- 3 files changed, 135 insertions(+), 4 deletions(-) diff --git a/packages/engine/src/services/chunkEncoder.test.ts b/packages/engine/src/services/chunkEncoder.test.ts index 5b0091639a..b358fe57c7 100644 --- a/packages/engine/src/services/chunkEncoder.test.ts +++ b/packages/engine/src/services/chunkEncoder.test.ts @@ -355,6 +355,117 @@ describe("encodeFramesChunkedConcat ffmpegEncodeTimeout", () => { }); }); +describe("muxVideoWithAudio audio codec handling", () => { + it("copies HyperFrames AAC sidecars into MP4 instead of re-encoding", async () => { + const { spawn, calls } = createSpawnSpy(); + vi.resetModules(); + vi.doMock("child_process", () => ({ spawn })); + + const { muxVideoWithAudio } = await import("./chunkEncoder.js"); + const muxPromise = muxVideoWithAudio( + "/tmp/video-only.mp4", + "/tmp/audio.aac", + "/tmp/output.mp4", + undefined, + undefined, + { num: 30, den: 1 }, + ); + + expect(calls).toHaveLength(1); + expect(calls[0]!.args).toEqual([ + "-i", + "/tmp/video-only.mp4", + "-i", + "/tmp/audio.aac", + "-c:v", + "copy", + "-c:a", + "copy", + "-movflags", + "+faststart", + "-avoid_negative_ts", + "make_zero", + "-r", + "30", + "-shortest", + "-y", + "/tmp/output.mp4", + ]); + expect(calls[0]!.args).not.toContain("-use_editlist"); + + emitClose(calls[0]!.proc, 0); + await expect(muxPromise).resolves.toMatchObject({ + success: true, + outputPath: "/tmp/output.mp4", + }); + }); + + it("still transcodes non-AAC audio when muxing MP4", async () => { + const { spawn, calls } = createSpawnSpy(); + vi.resetModules(); + vi.doMock("child_process", () => ({ spawn })); + + const { muxVideoWithAudio } = await import("./chunkEncoder.js"); + const muxPromise = muxVideoWithAudio( + "/tmp/video-only.mp4", + "/tmp/audio.wav", + "/tmp/output.mp4", + ); + + expect(calls).toHaveLength(1); + expect(calls[0]!.args).toContain("-c:a"); + expect(calls[0]!.args[calls[0]!.args.indexOf("-c:a") + 1]).toBe("aac"); + expect(calls[0]!.args).toContain("-b:a"); + expect(calls[0]!.args).toContain("+faststart"); + + emitClose(calls[0]!.proc, 0); + await expect(muxPromise).resolves.toMatchObject({ success: true }); + }); + + it("copies HyperFrames AAC sidecars into MOV containers", async () => { + const { spawn, calls } = createSpawnSpy(); + vi.resetModules(); + vi.doMock("child_process", () => ({ spawn })); + + const { muxVideoWithAudio } = await import("./chunkEncoder.js"); + const muxPromise = muxVideoWithAudio( + "/tmp/video-only.mov", + "/tmp/audio.aac", + "/tmp/output.mov", + ); + + expect(calls).toHaveLength(1); + expect(calls[0]!.args).toContain("-c:a"); + expect(calls[0]!.args[calls[0]!.args.indexOf("-c:a") + 1]).toBe("copy"); + expect(calls[0]!.args).not.toContain("-b:a"); + expect(calls[0]!.args).not.toContain("+faststart"); + + emitClose(calls[0]!.proc, 0); + await expect(muxPromise).resolves.toMatchObject({ success: true }); + }); + + it("keeps WebM audio on the Opus transcode path", async () => { + const { spawn, calls } = createSpawnSpy(); + vi.resetModules(); + vi.doMock("child_process", () => ({ spawn })); + + const { muxVideoWithAudio } = await import("./chunkEncoder.js"); + const muxPromise = muxVideoWithAudio( + "/tmp/video-only.webm", + "/tmp/audio.aac", + "/tmp/output.webm", + ); + + expect(calls).toHaveLength(1); + expect(calls[0]!.args).toContain("-c:a"); + expect(calls[0]!.args[calls[0]!.args.indexOf("-c:a") + 1]).toBe("libopus"); + expect(calls[0]!.args).not.toContain("+faststart"); + + emitClose(calls[0]!.proc, 0); + await expect(muxPromise).resolves.toMatchObject({ success: true }); + }); +}); + describe("getEncoderPreset", () => { it("returns h264 with yuv420p for mp4 format", () => { const preset = getEncoderPreset("standard", "mp4"); diff --git a/packages/engine/src/services/chunkEncoder.ts b/packages/engine/src/services/chunkEncoder.ts index fb11da4680..7df4ff03fd 100644 --- a/packages/engine/src/services/chunkEncoder.ts +++ b/packages/engine/src/services/chunkEncoder.ts @@ -8,7 +8,7 @@ import { spawn } from "child_process"; import { copyFileSync, existsSync, mkdirSync, readdirSync, statSync, writeFileSync } from "fs"; -import { join, dirname } from "path"; +import { join, dirname, extname } from "path"; import { trackChildProcess } from "../utils/processTracker.js"; import { DEFAULT_CONFIG, type EngineConfig } from "../config.js"; import { @@ -44,6 +44,10 @@ function appendEncodeTimeoutMessage(error: string, timedOut: boolean, timeoutMs: return `${error}\nFFmpeg killed after exceeding ffmpegEncodeTimeout (${timeoutMs} ms)`; } +function isAacSidecar(audioPath: string): boolean { + return extname(audioPath).toLowerCase() === ".aac"; +} + /** * Get encoder preset for a given quality and output format. * WebM uses VP9 with alpha-capable pixel format; MP4 uses h264 (or h265 for HDR); @@ -703,9 +707,21 @@ export async function muxVideoWithAudio( if (isWebm) { args.push("-c:a", "libopus", "-b:a", "128k"); } else if (isMov) { - args.push("-c:a", "aac", "-b:a", "192k"); + if (isAacSidecar(audioPath)) { + args.push("-c:a", "copy"); + } else { + args.push("-c:a", "aac", "-b:a", "192k"); + } } else { - args.push("-c:a", "aac", "-b:a", "192k", "-movflags", "+faststart"); + // HyperFrames' audio mixer already writes an AAC sidecar. Re-encoding + // that AAC during MP4 mux adds a second encoder-priming interval; ffmpeg + // preserves the gap as an empty video edit list, which QuickTime/Safari + // render as a black first frame. Copy the mixed sidecar instead. + if (isAacSidecar(audioPath)) { + args.push("-c:a", "copy", "-movflags", "+faststart"); + } else { + args.push("-c:a", "aac", "-b:a", "192k", "-movflags", "+faststart"); + } } // PTS bases can diverge during mux and reintroduce negative DTS. See // buildEncoderArgs for the full reasoning on why that breaks playback. diff --git a/packages/producer/src/services/distributed/assemble.test.ts b/packages/producer/src/services/distributed/assemble.test.ts index 74599f8e4a..a12d644173 100644 --- a/packages/producer/src/services/distributed/assemble.test.ts +++ b/packages/producer/src/services/distributed/assemble.test.ts @@ -146,7 +146,7 @@ function probeStream( "-select_streams", streamSelector, "-show_entries", - "stream=duration,nb_frames,nb_read_packets,codec_name,r_frame_rate", + "stream=start_time,duration,nb_frames,nb_read_packets,codec_name,r_frame_rate", "-count_packets", "-of", "json", @@ -316,6 +316,10 @@ describe("assemble()", () => { const audioStream = probeStream(outputPath, "a:0"); expect(audioStream).toBeDefined(); expect(audioStream?.codec_name).toBe("aac"); + const videoStream = probeStream(outputPath, "v:0"); + expect(videoStream).toBeDefined(); + expect(Number(videoStream?.start_time ?? NaN)).toBeLessThan(0.001); + expect(Number(audioStream?.start_time ?? NaN)).toBeLessThan(0.001); // Audio duration should be within ~25ms of `totalFrames / fps` after // pad/trim. The 25ms tolerance absorbs AAC frame quantization (1024 // samples @ 48kHz = ~21ms). From 7ef576949499353e7077bcff3a3d1a24206328d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81ngel?= Date: Sat, 20 Jun 2026 16:41:29 -0400 Subject: [PATCH 2/4] fix(producer): avoid AAC re-encode in distributed audio pad --- .../engine/src/services/chunkEncoder.test.ts | 30 ++- packages/engine/src/services/chunkEncoder.ts | 31 +++- .../src/services/distributed/assemble.test.ts | 42 +++++ .../src/services/distributed/assemble.ts | 2 +- .../src/services/render/audioPadTrim.test.ts | 68 +++++-- .../src/services/render/audioPadTrim.ts | 173 +++++++++++++++--- .../services/render/stages/assembleStage.ts | 2 +- 7 files changed, 291 insertions(+), 57 deletions(-) diff --git a/packages/engine/src/services/chunkEncoder.test.ts b/packages/engine/src/services/chunkEncoder.test.ts index b358fe57c7..fe6b1672f2 100644 --- a/packages/engine/src/services/chunkEncoder.test.ts +++ b/packages/engine/src/services/chunkEncoder.test.ts @@ -400,6 +400,34 @@ describe("muxVideoWithAudio audio codec handling", () => { }); }); + it("uses the caller-provided AAC codec contract instead of the sidecar extension", async () => { + const { spawn, calls } = createSpawnSpy(); + vi.resetModules(); + vi.doMock("child_process", () => ({ spawn })); + + const { muxVideoWithAudio } = await import("./chunkEncoder.js"); + const muxPromise = muxVideoWithAudio( + "/tmp/video-only.mp4", + "/tmp/audio-sidecar", + "/tmp/output.mp4", + undefined, + { audioCodec: "aac" }, + { num: 30, den: 1 }, + ); + + expect(calls).toHaveLength(1); + expect(calls[0]!.args).toContain("-c:a"); + expect(calls[0]!.args[calls[0]!.args.indexOf("-c:a") + 1]).toBe("copy"); + expect(calls[0]!.args).not.toContain("-b:a"); + expect(calls[0]!.args).toContain("+faststart"); + + emitClose(calls[0]!.proc, 0); + await expect(muxPromise).resolves.toMatchObject({ + success: true, + outputPath: "/tmp/output.mp4", + }); + }); + it("still transcodes non-AAC audio when muxing MP4", async () => { const { spawn, calls } = createSpawnSpy(); vi.resetModules(); @@ -422,7 +450,7 @@ describe("muxVideoWithAudio audio codec handling", () => { await expect(muxPromise).resolves.toMatchObject({ success: true }); }); - it("copies HyperFrames AAC sidecars into MOV containers", async () => { + it("copies HyperFrames AAC sidecars into MOV containers without MP4 faststart flags", async () => { const { spawn, calls } = createSpawnSpy(); vi.resetModules(); vi.doMock("child_process", () => ({ spawn })); diff --git a/packages/engine/src/services/chunkEncoder.ts b/packages/engine/src/services/chunkEncoder.ts index 7df4ff03fd..18ecabd50e 100644 --- a/packages/engine/src/services/chunkEncoder.ts +++ b/packages/engine/src/services/chunkEncoder.ts @@ -48,6 +48,21 @@ function isAacSidecar(audioPath: string): boolean { return extname(audioPath).toLowerCase() === ".aac"; } +export interface MuxVideoWithAudioOptions extends Partial< + Pick +> { + /** + * Codec of the sidecar audio when the caller already knows it. HyperFrames + * render paths pass the mixed AAC sidecar by contract, so muxing should not + * depend on the file extension alone. + */ + audioCodec?: "aac"; +} + +function shouldCopyAacSidecar(audioPath: string, options: MuxVideoWithAudioOptions | undefined) { + return options?.audioCodec === "aac" || isAacSidecar(audioPath); +} + /** * Get encoder preset for a given quality and output format. * WebM uses VP9 with alpha-capable pixel format; MP4 uses h264 (or h265 for HDR); @@ -694,7 +709,7 @@ export async function muxVideoWithAudio( audioPath: string, outputPath: string, signal?: AbortSignal, - config?: Partial>, + config?: MuxVideoWithAudioOptions, fps?: Fps, ): Promise { const outputDir = dirname(outputPath); @@ -702,22 +717,24 @@ export async function muxVideoWithAudio( const isWebm = outputPath.endsWith(".webm"); const isMov = outputPath.endsWith(".mov"); + const shouldCopyAudio = shouldCopyAacSidecar(audioPath, config); const args = ["-i", videoPath, "-i", audioPath, "-c:v", "copy"]; if (isWebm) { args.push("-c:a", "libopus", "-b:a", "128k"); } else if (isMov) { - if (isAacSidecar(audioPath)) { + if (shouldCopyAudio) { args.push("-c:a", "copy"); } else { args.push("-c:a", "aac", "-b:a", "192k"); } } else { - // HyperFrames' audio mixer already writes an AAC sidecar. Re-encoding - // that AAC during MP4 mux adds a second encoder-priming interval; ffmpeg - // preserves the gap as an empty video edit list, which QuickTime/Safari - // render as a black first frame. Copy the mixed sidecar instead. - if (isAacSidecar(audioPath)) { + // processCompositionAudio (audioMixer.ts) performs the AAC encode and + // owns the single encoder-priming interval. Copying that sidecar into + // MP4 preserves the correct priming metadata; re-encoding it during mux + // creates another priming interval that ffmpeg writes as an empty leading + // video edit list, which QuickTime/Safari render as a black first frame. + if (shouldCopyAudio) { args.push("-c:a", "copy", "-movflags", "+faststart"); } else { args.push("-c:a", "aac", "-b:a", "192k", "-movflags", "+faststart"); diff --git a/packages/producer/src/services/distributed/assemble.test.ts b/packages/producer/src/services/distributed/assemble.test.ts index a12d644173..4cb662097a 100644 --- a/packages/producer/src/services/distributed/assemble.test.ts +++ b/packages/producer/src/services/distributed/assemble.test.ts @@ -330,6 +330,48 @@ describe("assemble()", () => { TIMEOUT_MS, ); + it( + "muxes padded short audio without shifting the first video frame", + async () => { + if (!hasFfmpeg) return; + + const chunks: ChunkSliceJson[] = [ + { index: 0, startFrame: 0, endFrame: 6 }, + { index: 1, startFrame: 6, endFrame: 12 }, + ]; + const totalFrames = 12; + const fps = 30; + const planDir = buildPlanDir("mp4", chunks, totalFrames, true); + + const chunkAPath = join(planDir, "chunk-0.mp4"); + const chunkBPath = join(planDir, "chunk-1.mp4"); + const audioPath = join(planDir, "audio.aac"); + makeMp4Chunk(chunkAPath, 6); + makeMp4Chunk(chunkBPath, 6); + // Audio is shorter than the video, forcing the distributed pad branch. + makeAacAudio(audioPath, totalFrames / fps - 0.2); + + const outputPath = join(planDir, "output-audio-padded.mp4"); + const result = await assemble(planDir, [chunkAPath, chunkBPath], audioPath, outputPath); + + expect(existsSync(outputPath)).toBe(true); + expect(result.framesEncoded).toBe(totalFrames); + + const audioStream = probeStream(outputPath, "a:0"); + expect(audioStream).toBeDefined(); + expect(audioStream?.codec_name).toBe("aac"); + const videoStream = probeStream(outputPath, "v:0"); + expect(videoStream).toBeDefined(); + expect(Number(videoStream?.start_time ?? NaN)).toBeLessThan(0.001); + expect(Number(audioStream?.start_time ?? NaN)).toBeLessThan(0.001); + + const audioDuration = Number(audioStream?.duration ?? 0); + const expected = totalFrames / fps; + expect(Math.abs(audioDuration - expected)).toBeLessThan(0.05); + }, + TIMEOUT_MS, + ); + it( "cfr:true re-encodes for exact avg_frame_rate matching r_frame_rate", async () => { diff --git a/packages/producer/src/services/distributed/assemble.ts b/packages/producer/src/services/distributed/assemble.ts index ec6eb98f1d..747240f034 100644 --- a/packages/producer/src/services/distributed/assemble.ts +++ b/packages/producer/src/services/distributed/assemble.ts @@ -320,7 +320,7 @@ export async function assemble( audioForMux, muxOutputPath, abortSignal, - undefined, + { audioCodec: "aac" }, { num: plan.dimensions.fpsNum, den: plan.dimensions.fpsDen }, ); if (!muxResult.success) { diff --git a/packages/producer/src/services/render/audioPadTrim.test.ts b/packages/producer/src/services/render/audioPadTrim.test.ts index 3a73fd1d51..821537a65c 100644 --- a/packages/producer/src/services/render/audioPadTrim.test.ts +++ b/packages/producer/src/services/render/audioPadTrim.test.ts @@ -16,6 +16,7 @@ import { describe, expect, it } from "bun:test"; import { buildPadTrimAudioArgs, + buildPadTrimAudioPlan, padOrTrimAudioToVideoFrameCount, type AudioProbeInfo, type PadTrimAudioInput, @@ -23,18 +24,47 @@ import { } from "./audioPadTrim.js"; describe("buildPadTrimAudioArgs", () => { - it("emits an apad filter when audio is shorter than target", () => { + it("emits a concat-copy pad plan when audio is shorter than target", () => { + const plan = buildPadTrimAudioPlan("/tmp/in.aac", "/tmp/out.aac", 4.0, 5.0, { + sampleRate: 48000, + channels: 2, + }); + expect(plan.operation).toBe("pad"); + expect(plan.steps).toHaveLength(2); + + const silenceArgs = plan.steps[0]!.args; + expect(plan.steps[0]!.kind).toBe("pad-silence"); + expect(silenceArgs).not.toContain("/tmp/in.aac"); + expect(silenceArgs[silenceArgs.indexOf("-i") + 1]).toBe( + "anullsrc=channel_layout=stereo:sample_rate=48000", + ); + expect(silenceArgs[silenceArgs.indexOf("-t") + 1]).toBe("1.000000"); + expect(silenceArgs[silenceArgs.indexOf("-c:a") + 1]).toBe("aac"); + + const concatArgs = plan.steps[1]!.args; + expect(plan.steps[1]!.kind).toBe("pad-concat"); + expect(concatArgs).toContain("concat"); + expect(concatArgs[concatArgs.indexOf("-c:a") + 1]).toBe("copy"); + expect(concatArgs[concatArgs.length - 1]).toBe("/tmp/out.aac"); + expect(plan.concatList?.contents).toContain("file '/tmp/in.aac'"); + expect(plan.concatList?.contents).toContain("file '/tmp/out.aac.pad-silence.aac'"); + expect(plan.cleanupPaths).toEqual([ + "/tmp/out.aac.pad-silence.aac", + "/tmp/out.aac.pad-concat.txt", + ]); + + const reencodedSourceStep = plan.steps.find( + (step) => + step.args.includes("/tmp/in.aac") && step.args[step.args.indexOf("-c:a") + 1] === "aac", + ); + expect(reencodedSourceStep).toBeUndefined(); + }); + + it("keeps the legacy args helper on the first pad materialization step", () => { const { args, operation } = buildPadTrimAudioArgs("/tmp/in.aac", "/tmp/out.aac", 4.0, 5.0); expect(operation).toBe("pad"); - const afIdx = args.indexOf("-af"); - expect(afIdx).toBeGreaterThan(-1); - expect(args[afIdx + 1]).toContain("apad=pad_dur="); - expect(args[afIdx + 1]).toMatch(/pad_dur=1\.0+/); - // Pad must re-encode — apad is a filter and filters can't combine with copy. - const codecIdx = args.indexOf("-c:a"); - expect(args[codecIdx + 1]).toBe("aac"); - expect(args[args.length - 1]).toBe("/tmp/out.aac"); - expect(args.includes("-y")).toBe(true); + expect(args).not.toContain("/tmp/in.aac"); + expect(args[args.indexOf("-t") + 1]).toBe("1.000000"); }); it("emits -t when audio is longer than target", () => { @@ -57,14 +87,14 @@ describe("buildPadTrimAudioArgs", () => { expect(args[codecIdx + 1]).toBe("copy"); }); - it("emits 6-decimal-place pad_dur (no scientific notation)", () => { + it("emits 6-decimal-place pad duration (no scientific notation)", () => { // 1.23ms — just over the AUDIO_DURATION_TOLERANCE_SECONDS=1ms threshold, // so we exercise the pad path with a tiny duration that would round to // exponent notation if we used `toString()` instead of `toFixed(6)`. const { args, operation } = buildPadTrimAudioArgs("/tmp/in.aac", "/tmp/out.aac", 0.0, 0.00123); expect(operation).toBe("pad"); - const afIdx = args.indexOf("-af"); - expect(args[afIdx + 1]).toBe("apad=pad_dur=0.001230"); + const tIdx = args.indexOf("-t"); + expect(args[tIdx + 1]).toBe("0.001230"); }); it("flags ~1ms drift as a copy (below the tolerance threshold)", () => { @@ -120,9 +150,11 @@ describe("padOrTrimAudioToVideoFrameCount", () => { expect(result.operation).toBe("pad"); expect(result.targetDurationSeconds).toBe(6); expect(result.sourceDurationSeconds).toBe(5.5); - expect(captured.args).toHaveLength(1); - const afIdx = captured.args[0]!.indexOf("-af"); - expect(captured.args[0]![afIdx + 1]).toBe("apad=pad_dur=0.500000"); + expect(captured.args).toHaveLength(2); + const tIdx = captured.args[0]!.indexOf("-t"); + expect(captured.args[0]![tIdx + 1]).toBe("0.500000"); + expect(captured.args[0]).not.toContain("/tmp/a.aac"); + expect(captured.args[1]![captured.args[1]!.indexOf("-c:a") + 1]).toBe("copy"); }); it("trims a video of N=120 frames at 30/1 fps with longer audio", async () => { @@ -162,8 +194,8 @@ describe("padOrTrimAudioToVideoFrameCount", () => { expect(result.success).toBe(true); expect(result.operation).toBe("pad"); expect(result.targetDurationSeconds).toBeCloseTo((120 * 1001) / 30000, 9); - const afIdx = captured.args[0]!.indexOf("-af"); - expect(captured.args[0]![afIdx + 1]).toMatch(/^apad=pad_dur=0\.004\d+$/); + const tIdx = captured.args[0]!.indexOf("-t"); + expect(captured.args[0]![tIdx + 1]).toMatch(/^0\.004\d+$/); }); it("propagates video probe failure as success=false", async () => { diff --git a/packages/producer/src/services/render/audioPadTrim.ts b/packages/producer/src/services/render/audioPadTrim.ts index 9f124f5060..e94008ee04 100644 --- a/packages/producer/src/services/render/audioPadTrim.ts +++ b/packages/producer/src/services/render/audioPadTrim.ts @@ -14,10 +14,12 @@ * "audio cuts off early" or "video shows a frozen final frame" bugs. * * The fix: post-pad/trim audio to *exactly* `frameCount / fps` seconds at - * assemble time. Pad with `apad=pad_dur=…` (silence fill), trim with `-t`. + * assemble time. Pad by concat-copying a generated silence tail, trim with + * `-t`, and avoid re-encoding the already mixed source AAC in either case. */ import { spawn } from "node:child_process"; +import { rmSync, writeFileSync } from "node:fs"; import { extractAudioMetadata, formatFfmpegError, @@ -45,6 +47,12 @@ export interface ProbeVideoFrameInfo { export interface AudioProbeInfo { /** Decoded duration in seconds. */ durationSeconds: number; + /** Audio sample rate in Hz. Used when generating pad silence. */ + sampleRate?: number; + /** Audio channel count. Used when generating pad silence. */ + channels?: number; + /** Codec name reported by ffprobe. */ + audioCodec?: string; } export interface PadTrimAudioInput { @@ -78,49 +86,95 @@ export interface PadTrimAudioResult { error?: string; } +export type PadTrimAudioStepKind = "copy" | "trim" | "pad-silence" | "pad-concat"; + +export interface PadTrimAudioStep { + kind: PadTrimAudioStepKind; + args: string[]; +} + +export interface PadTrimAudioPlan { + operation: PadTrimOperation; + steps: PadTrimAudioStep[]; + concatList?: { path: string; contents: string }; + cleanupPaths: string[]; +} + /** - * Pure helper: decide the pad/trim operation and build the ffmpeg argv list - * that materializes it. Exported separately so unit tests can pin both - * branches without spawning ffmpeg. + * Pure helper: decide the pad/trim operation and build the ffmpeg argv + * sequence that materializes it. Exported separately so unit tests can pin + * every branch without spawning ffmpeg. * - * - `sourceDuration < targetDuration` → pad with `apad=pad_dur=Δ`. - * Re-encode is required: `apad` is a filter and filters can't combine - * with `-c:a copy`. + * - `sourceDuration < targetDuration` → generate only the missing silence + * tail, then concat-copy the source AAC plus that tail. This avoids + * re-encoding the already mixed `audio.aac`; the pad branch remains the + * inverse of trim instead of becoming a second full-source AAC encode. * - `sourceDuration > targetDuration` → trim with `-t target`. `-c:a copy` * is preserved when the input is already AAC. * - `|Δ| < AUDIO_DURATION_TOLERANCE_SECONDS` → no-op `copy`, but we still * run ffmpeg with `-c:a copy` to materialize the output path. */ -export function buildPadTrimAudioArgs( +export function buildPadTrimAudioPlan( audioPath: string, outputPath: string, sourceDurationSeconds: number, targetDurationSeconds: number, -): { args: string[]; operation: PadTrimOperation } { + audioInfo: Pick = {}, +): PadTrimAudioPlan { const delta = targetDurationSeconds - sourceDurationSeconds; const targetSec = formatSeconds(targetDurationSeconds); if (Math.abs(delta) < AUDIO_DURATION_TOLERANCE_SECONDS) { return { operation: "copy", - args: ["-i", audioPath, "-c:a", "copy", "-y", outputPath], + steps: [{ kind: "copy", args: ["-i", audioPath, "-c:a", "copy", "-y", outputPath] }], + cleanupPaths: [], }; } if (delta > 0) { const padDur = formatSeconds(delta); + const silencePath = `${outputPath}.pad-silence.aac`; + const concatListPath = `${outputPath}.pad-concat.txt`; return { operation: "pad", - args: [ - "-i", - audioPath, - "-af", - `apad=pad_dur=${padDur}`, - "-c:a", - "aac", - "-b:a", - "192k", - "-y", - outputPath, + steps: [ + { + kind: "pad-silence", + args: [ + "-f", + "lavfi", + "-i", + `anullsrc=channel_layout=${channelLayoutForChannels(audioInfo.channels)}:sample_rate=${sampleRateForFilter(audioInfo.sampleRate)}`, + "-t", + padDur, + "-c:a", + "aac", + "-b:a", + "192k", + "-y", + silencePath, + ], + }, + { + kind: "pad-concat", + args: [ + "-f", + "concat", + "-safe", + "0", + "-i", + concatListPath, + "-c:a", + "copy", + "-y", + outputPath, + ], + }, ], + concatList: { + path: concatListPath, + contents: `${concatFileLine(audioPath)}\n${concatFileLine(silencePath)}\n`, + }, + cleanupPaths: [silencePath, concatListPath], }; } // Trim. `-t` truncates AAC without re-encoding because AAC frames are @@ -128,10 +182,28 @@ export function buildPadTrimAudioArgs( // packet boundary, fine for the ±1ms tolerance we care about here. return { operation: "trim", - args: ["-i", audioPath, "-t", targetSec, "-c:a", "copy", "-y", outputPath], + steps: [ + { kind: "trim", args: ["-i", audioPath, "-t", targetSec, "-c:a", "copy", "-y", outputPath] }, + ], + cleanupPaths: [], }; } +export function buildPadTrimAudioArgs( + audioPath: string, + outputPath: string, + sourceDurationSeconds: number, + targetDurationSeconds: number, +): { args: string[]; operation: PadTrimOperation } { + const plan = buildPadTrimAudioPlan( + audioPath, + outputPath, + sourceDurationSeconds, + targetDurationSeconds, + ); + return { operation: plan.operation, args: plan.steps[0]?.args ?? [] }; +} + /** * Format a duration as a fixed-precision decimal string. ffmpeg parses * scientific notation inconsistently across versions (some treat `1e-3` as @@ -142,6 +214,24 @@ function formatSeconds(sec: number): string { return sec.toFixed(6); } +function sampleRateForFilter(sampleRate: number | undefined): number { + return sampleRate !== undefined && Number.isFinite(sampleRate) && sampleRate > 0 + ? Math.round(sampleRate) + : 48000; +} + +function channelLayoutForChannels(channels: number | undefined): string { + if (channels === 1) return "mono"; + if (channels === 6) return "5.1"; + if (channels === 8) return "7.1"; + return "stereo"; +} + +function concatFileLine(path: string): string { + const normalized = path.replace(/\\/g, "/"); + return `file '${normalized.replace(/'/g, "'\\''")}'`; +} + /** * Pad or trim `audio.aac` so its exact duration matches `frameCount / fps` * for the assembled video. @@ -197,30 +287,50 @@ export async function padOrTrimAudioToVideoFrameCount( } const targetDurationSeconds = (videoInfo.frameCount * videoInfo.fpsDen) / videoInfo.fpsNum; - const { args, operation } = buildPadTrimAudioArgs( + const plan = buildPadTrimAudioPlan( input.audioPath, input.outputPath, audioInfo.durationSeconds, targetDurationSeconds, + audioInfo, ); - const ffmpegResult = await runner(args); - if (!ffmpegResult.success) { + try { + if (plan.concatList) writeFileSync(plan.concatList.path, plan.concatList.contents, "utf-8"); + + for (const step of plan.steps) { + const ffmpegResult = await runner(step.args); + if (!ffmpegResult.success) { + return { + success: false, + outputPath: input.outputPath, + targetDurationSeconds, + sourceDurationSeconds: audioInfo.durationSeconds, + operation: plan.operation, + error: ffmpegResult.error, + }; + } + } + } catch (err) { return { success: false, outputPath: input.outputPath, targetDurationSeconds, sourceDurationSeconds: audioInfo.durationSeconds, - operation, - error: ffmpegResult.error, + operation: plan.operation, + error: `audioPadTrim: failed to materialize ${plan.operation}: ${ + err instanceof Error ? err.message : String(err) + }`, }; + } finally { + for (const path of plan.cleanupPaths) rmSync(path, { force: true }); } return { success: true, outputPath: input.outputPath, targetDurationSeconds, sourceDurationSeconds: audioInfo.durationSeconds, - operation, + operation: plan.operation, }; } @@ -307,7 +417,12 @@ function parseFrameRate(rate: string): { fpsNum: number; fpsDen: number } { async function defaultProbeAudioInfo(audioPath: string): Promise { // extractAudioMetadata is the shared ffprobe wrapper (caches results). const metadata: AudioMetadata = await extractAudioMetadata(audioPath); - return { durationSeconds: metadata.durationSeconds }; + return { + durationSeconds: metadata.durationSeconds, + sampleRate: metadata.sampleRate, + channels: metadata.channels, + audioCodec: metadata.audioCodec, + }; } async function defaultRunFfmpeg(args: string[]): Promise<{ success: boolean; error?: string }> { diff --git a/packages/producer/src/services/render/stages/assembleStage.ts b/packages/producer/src/services/render/stages/assembleStage.ts index 83ae671f45..c3e4ebc3e7 100644 --- a/packages/producer/src/services/render/stages/assembleStage.ts +++ b/packages/producer/src/services/render/stages/assembleStage.ts @@ -60,7 +60,7 @@ export async function runAssembleStage(input: AssembleStageInput): Promise Date: Sat, 20 Jun 2026 16:49:06 -0400 Subject: [PATCH 3/4] fix(engine): probe AAC sidecars before mux copy decision --- .../engine/src/services/chunkEncoder.test.ts | 73 +++++++++++++++++++ packages/engine/src/services/chunkEncoder.ts | 32 +++++++- 2 files changed, 102 insertions(+), 3 deletions(-) diff --git a/packages/engine/src/services/chunkEncoder.test.ts b/packages/engine/src/services/chunkEncoder.test.ts index fe6b1672f2..d7cd0a8b26 100644 --- a/packages/engine/src/services/chunkEncoder.test.ts +++ b/packages/engine/src/services/chunkEncoder.test.ts @@ -18,6 +18,7 @@ afterEach(() => { } vi.resetModules(); vi.doUnmock("child_process"); + vi.doUnmock("../utils/ffprobe.js"); vi.useRealTimers(); }); @@ -88,6 +89,11 @@ function emitClose(proc: FakeProc, code: number): void { proc.emit("close", code); } +async function flushMuxCodecResolution(): Promise { + await Promise.resolve(); + await Promise.resolve(); +} + describe("ENCODER_PRESETS", () => { it("has draft, standard, and high presets", () => { expect(ENCODER_PRESETS).toHaveProperty("draft"); @@ -371,6 +377,7 @@ describe("muxVideoWithAudio audio codec handling", () => { { num: 30, den: 1 }, ); + await flushMuxCodecResolution(); expect(calls).toHaveLength(1); expect(calls[0]!.args).toEqual([ "-i", @@ -415,6 +422,7 @@ describe("muxVideoWithAudio audio codec handling", () => { { num: 30, den: 1 }, ); + await flushMuxCodecResolution(); expect(calls).toHaveLength(1); expect(calls[0]!.args).toContain("-c:a"); expect(calls[0]!.args[calls[0]!.args.indexOf("-c:a") + 1]).toBe("copy"); @@ -428,6 +436,69 @@ describe("muxVideoWithAudio audio codec handling", () => { }); }); + it("probes unknown-extension AAC sidecars before choosing the MP4 copy path", async () => { + const { spawn, calls } = createSpawnSpy(); + const extractAudioMetadata = vi.fn(async () => ({ + durationSeconds: 1, + sampleRate: 48000, + channels: 2, + audioCodec: "aac", + })); + vi.resetModules(); + vi.doMock("child_process", () => ({ spawn })); + vi.doMock("../utils/ffprobe.js", () => ({ extractAudioMetadata })); + + const { muxVideoWithAudio } = await import("./chunkEncoder.js"); + const muxPromise = muxVideoWithAudio( + "/tmp/video-only.mp4", + "/tmp/audio-sidecar", + "/tmp/output.mp4", + ); + + await flushMuxCodecResolution(); + expect(extractAudioMetadata).toHaveBeenCalledWith("/tmp/audio-sidecar"); + expect(calls).toHaveLength(1); + expect(calls[0]!.args).toContain("-c:a"); + expect(calls[0]!.args[calls[0]!.args.indexOf("-c:a") + 1]).toBe("copy"); + expect(calls[0]!.args).not.toContain("-b:a"); + + emitClose(calls[0]!.proc, 0); + await expect(muxPromise).resolves.toMatchObject({ + success: true, + outputPath: "/tmp/output.mp4", + }); + }); + + it("keeps probed non-AAC unknown-extension sidecars on the MP4 transcode path", async () => { + const { spawn, calls } = createSpawnSpy(); + const extractAudioMetadata = vi.fn(async () => ({ + durationSeconds: 1, + sampleRate: 48000, + channels: 2, + audioCodec: "mp3", + })); + vi.resetModules(); + vi.doMock("child_process", () => ({ spawn })); + vi.doMock("../utils/ffprobe.js", () => ({ extractAudioMetadata })); + + const { muxVideoWithAudio } = await import("./chunkEncoder.js"); + const muxPromise = muxVideoWithAudio( + "/tmp/video-only.mp4", + "/tmp/audio-sidecar", + "/tmp/output.mp4", + ); + + await flushMuxCodecResolution(); + expect(extractAudioMetadata).toHaveBeenCalledWith("/tmp/audio-sidecar"); + expect(calls).toHaveLength(1); + expect(calls[0]!.args).toContain("-c:a"); + expect(calls[0]!.args[calls[0]!.args.indexOf("-c:a") + 1]).toBe("aac"); + expect(calls[0]!.args).toContain("-b:a"); + + emitClose(calls[0]!.proc, 0); + await expect(muxPromise).resolves.toMatchObject({ success: true }); + }); + it("still transcodes non-AAC audio when muxing MP4", async () => { const { spawn, calls } = createSpawnSpy(); vi.resetModules(); @@ -440,6 +511,7 @@ describe("muxVideoWithAudio audio codec handling", () => { "/tmp/output.mp4", ); + await flushMuxCodecResolution(); expect(calls).toHaveLength(1); expect(calls[0]!.args).toContain("-c:a"); expect(calls[0]!.args[calls[0]!.args.indexOf("-c:a") + 1]).toBe("aac"); @@ -462,6 +534,7 @@ describe("muxVideoWithAudio audio codec handling", () => { "/tmp/output.mov", ); + await flushMuxCodecResolution(); expect(calls).toHaveLength(1); expect(calls[0]!.args).toContain("-c:a"); expect(calls[0]!.args[calls[0]!.args.indexOf("-c:a") + 1]).toBe("copy"); diff --git a/packages/engine/src/services/chunkEncoder.ts b/packages/engine/src/services/chunkEncoder.ts index 18ecabd50e..2e21a97fd6 100644 --- a/packages/engine/src/services/chunkEncoder.ts +++ b/packages/engine/src/services/chunkEncoder.ts @@ -20,6 +20,7 @@ import { import { type HdrTransfer, getHdrEncoderColorParams } from "../utils/hdr.js"; import { formatFfmpegError, runFfmpeg } from "../utils/runFfmpeg.js"; import { getFfmpegBinary } from "../utils/ffmpegBinaries.js"; +import { extractAudioMetadata } from "../utils/ffprobe.js"; import { type Fps, fpsToFfmpegArg } from "@hyperframes/core"; import type { EncoderOptions, EncodeResult, MuxResult } from "./chunkEncoder.types.js"; @@ -48,6 +49,16 @@ function isAacSidecar(audioPath: string): boolean { return extname(audioPath).toLowerCase() === ".aac"; } +const KNOWN_NON_AAC_AUDIO_EXTENSIONS = new Set([ + ".flac", + ".mp3", + ".oga", + ".ogg", + ".opus", + ".wav", + ".webm", +]); + export interface MuxVideoWithAudioOptions extends Partial< Pick > { @@ -59,8 +70,23 @@ export interface MuxVideoWithAudioOptions extends Partial< audioCodec?: "aac"; } -function shouldCopyAacSidecar(audioPath: string, options: MuxVideoWithAudioOptions | undefined) { - return options?.audioCodec === "aac" || isAacSidecar(audioPath); +async function shouldCopyAacSidecar( + audioPath: string, + options: MuxVideoWithAudioOptions | undefined, +) { + if (options?.audioCodec === "aac" || isAacSidecar(audioPath)) return true; + + const audioExtension = extname(audioPath).toLowerCase(); + if (KNOWN_NON_AAC_AUDIO_EXTENSIONS.has(audioExtension)) return false; + + try { + const metadata = await extractAudioMetadata(audioPath); + return metadata.audioCodec === "aac"; + } catch { + // Preserve the pre-existing fallback for invalid or unprobeable sidecars: + // let the final ffmpeg transcode path surface the actionable mux error. + return false; + } } /** @@ -717,7 +743,7 @@ export async function muxVideoWithAudio( const isWebm = outputPath.endsWith(".webm"); const isMov = outputPath.endsWith(".mov"); - const shouldCopyAudio = shouldCopyAacSidecar(audioPath, config); + const shouldCopyAudio = isWebm ? false : await shouldCopyAacSidecar(audioPath, config); const args = ["-i", videoPath, "-i", audioPath, "-c:v", "copy"]; if (isWebm) { From 2f2bc2ed343a354b5cea242ecd267127025611bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81ngel?= Date: Sat, 20 Jun 2026 16:57:45 -0400 Subject: [PATCH 4/4] fix(producer): avoid temp concat file for audio padding --- .../src/services/render/audioPadTrim.test.ts | 10 ++- .../src/services/render/audioPadTrim.ts | 70 +++++++++++++++---- 2 files changed, 59 insertions(+), 21 deletions(-) diff --git a/packages/producer/src/services/render/audioPadTrim.test.ts b/packages/producer/src/services/render/audioPadTrim.test.ts index 821537a65c..cbf62e9623 100644 --- a/packages/producer/src/services/render/audioPadTrim.test.ts +++ b/packages/producer/src/services/render/audioPadTrim.test.ts @@ -44,14 +44,12 @@ describe("buildPadTrimAudioArgs", () => { const concatArgs = plan.steps[1]!.args; expect(plan.steps[1]!.kind).toBe("pad-concat"); expect(concatArgs).toContain("concat"); + expect(concatArgs[concatArgs.indexOf("-i") + 1]).toBe("pipe:0"); expect(concatArgs[concatArgs.indexOf("-c:a") + 1]).toBe("copy"); expect(concatArgs[concatArgs.length - 1]).toBe("/tmp/out.aac"); - expect(plan.concatList?.contents).toContain("file '/tmp/in.aac'"); - expect(plan.concatList?.contents).toContain("file '/tmp/out.aac.pad-silence.aac'"); - expect(plan.cleanupPaths).toEqual([ - "/tmp/out.aac.pad-silence.aac", - "/tmp/out.aac.pad-concat.txt", - ]); + expect(plan.steps[1]!.stdin).toContain("file 'file:///tmp/in.aac'"); + expect(plan.steps[1]!.stdin).toContain("file 'file:///tmp/out.aac.pad-silence.aac'"); + expect(plan.cleanupPaths).toEqual(["/tmp/out.aac.pad-silence.aac"]); const reencodedSourceStep = plan.steps.find( (step) => diff --git a/packages/producer/src/services/render/audioPadTrim.ts b/packages/producer/src/services/render/audioPadTrim.ts index e94008ee04..b089b09a81 100644 --- a/packages/producer/src/services/render/audioPadTrim.ts +++ b/packages/producer/src/services/render/audioPadTrim.ts @@ -19,10 +19,12 @@ */ import { spawn } from "node:child_process"; -import { rmSync, writeFileSync } from "node:fs"; +import { rmSync } from "node:fs"; +import { pathToFileURL } from "node:url"; import { extractAudioMetadata, formatFfmpegError, + getFfmpegBinary, getFfprobeBinary, runFfmpeg, type AudioMetadata, @@ -68,7 +70,10 @@ export interface PadTrimAudioInput { */ probeVideoFrameInfo?: (videoPath: string) => Promise; probeAudioInfo?: (audioPath: string) => Promise; - runFfmpeg?: (args: string[]) => Promise<{ success: boolean; error?: string }>; + runFfmpeg?: ( + args: string[], + options?: { stdin?: string }, + ) => Promise<{ success: boolean; error?: string }>; } export type PadTrimOperation = "pad" | "trim" | "copy"; @@ -91,12 +96,12 @@ export type PadTrimAudioStepKind = "copy" | "trim" | "pad-silence" | "pad-concat export interface PadTrimAudioStep { kind: PadTrimAudioStepKind; args: string[]; + stdin?: string; } export interface PadTrimAudioPlan { operation: PadTrimOperation; steps: PadTrimAudioStep[]; - concatList?: { path: string; contents: string }; cleanupPaths: string[]; } @@ -133,7 +138,6 @@ export function buildPadTrimAudioPlan( if (delta > 0) { const padDur = formatSeconds(delta); const silencePath = `${outputPath}.pad-silence.aac`; - const concatListPath = `${outputPath}.pad-concat.txt`; return { operation: "pad", steps: [ @@ -161,20 +165,19 @@ export function buildPadTrimAudioPlan( "concat", "-safe", "0", + "-protocol_whitelist", + "file,pipe,crypto,data", "-i", - concatListPath, + "pipe:0", "-c:a", "copy", "-y", outputPath, ], + stdin: `${concatFileLine(audioPath)}\n${concatFileLine(silencePath)}\n`, }, ], - concatList: { - path: concatListPath, - contents: `${concatFileLine(audioPath)}\n${concatFileLine(silencePath)}\n`, - }, - cleanupPaths: [silencePath, concatListPath], + cleanupPaths: [silencePath], }; } // Trim. `-t` truncates AAC without re-encoding because AAC frames are @@ -228,7 +231,7 @@ function channelLayoutForChannels(channels: number | undefined): string { } function concatFileLine(path: string): string { - const normalized = path.replace(/\\/g, "/"); + const normalized = pathToFileURL(path).href; return `file '${normalized.replace(/'/g, "'\\''")}'`; } @@ -296,10 +299,8 @@ export async function padOrTrimAudioToVideoFrameCount( ); try { - if (plan.concatList) writeFileSync(plan.concatList.path, plan.concatList.contents, "utf-8"); - for (const step of plan.steps) { - const ffmpegResult = await runner(step.args); + const ffmpegResult = await runner(step.args, { stdin: step.stdin }); if (!ffmpegResult.success) { return { success: false, @@ -425,7 +426,12 @@ async function defaultProbeAudioInfo(audioPath: string): Promise }; } -async function defaultRunFfmpeg(args: string[]): Promise<{ success: boolean; error?: string }> { +async function defaultRunFfmpeg( + args: string[], + options?: { stdin?: string }, +): Promise<{ success: boolean; error?: string }> { + if (options?.stdin !== undefined) return runFfmpegWithStdin(args, options.stdin); + const result = await runFfmpeg(args); if (result.success) return { success: true }; return { @@ -434,6 +440,40 @@ async function defaultRunFfmpeg(args: string[]): Promise<{ success: boolean; err }; } +async function runFfmpegWithStdin( + args: string[], + stdin: string, +): Promise<{ success: boolean; error?: string }> { + return new Promise((resolve) => { + const proc = spawn(getFfmpegBinary(), args); + let stderr = ""; + + proc.stderr.on("data", (data: Buffer) => { + stderr += data.toString(); + }); + + proc.on("error", (err) => { + resolve({ + success: false, + error: `[audioPadTrim] ${err instanceof Error ? err.message : String(err)}`, + }); + }); + + proc.on("close", (code) => { + if (code === 0) { + resolve({ success: true }); + return; + } + resolve({ + success: false, + error: `[audioPadTrim] ${formatFfmpegError(code, stderr)}`, + }); + }); + + proc.stdin.end(stdin); + }); +} + // ── ffprobe JSON runner (shared between fast/slow video probe paths) ───── function runFfprobeJson(args: string[]): Promise {