From bffec67a6ded306b1109b985ec6a66b624a871af Mon Sep 17 00:00:00 2001
From: Seongho Bae
Date: Sun, 5 Jul 2026 15:44:59 +0900
Subject: [PATCH 1/2] test(separation): characterize stem-separation quality
 against known sources
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The local stem separator is a frequency-band FFT heuristic, not neural
source separation — but nothing measured how well it recovers a known
source from a mixture (existing tests only check role keyword mapping,
band routing of pure tones, and error handling).

Add two characterization tests over a controlled ground-truth mix
(harmonic-rich bass + vocal-band tone):

- recovered bass stem SI-SDR stays below a clean-isolation bar (~9 dB
  measured; a neural model would exceed ~20 dB on a signal this simple)
- a lone bass source leaks a meaningful energy share (~11%) into other
  stems, proving it splits by frequency band, not by source

These pin current behaviour and act as a regression guard; the bounds
should be re-baselined if a real separation model is introduced.

Co-Authored-By: Claude Opus 4.8
Claude-Session: https://claude.ai/code/session_01RjGVapDZ3k7V7zKYk16P4C
---
 .../tests/test_separation_quality.py          | 101 ++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 services/analysis-engine/tests/test_separation_quality.py

diff --git a/services/analysis-engine/tests/test_separation_quality.py b/services/analysis-engine/tests/test_separation_quality.py
new file mode 100644
index 00000000..e18963fc
--- /dev/null
+++ b/services/analysis-engine/tests/test_separation_quality.py
@@ -0,0 +1,101 @@
+"""Separation-quality characterization for the local stem separator.
+
+These tests measure how well :class:`AudioStemSeparator` recovers *known*
+source signals from a mixture, using signals whose ground truth we control.
+
+Important: the current separator is a frequency-band FFT heuristic, not a
+neural source-separation model. It routes energy by frequency band, so a
+real instrument (whose harmonics span several bands) is split across stems
+rather than isolated. These tests pin that behaviour with concrete numbers.
+They are characterization/regression guards, NOT a quality bar — if a real
+model (e.g. demucs) is introduced, SI-SDR will rise well past these bounds
+and these assertions should be re-baselined.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import soundfile as sf
+
+from bandscope_analysis.separation.audio_separator import (
+    AudioSeparationConfig,
+    AudioStemSeparator,
+)
+
+_SR = 22_050
+
+
+def _si_sdr(estimate: np.ndarray, reference: np.ndarray) -> float:
+    """Scale-invariant SDR in dB (higher = closer to the reference source)."""
+    estimate = estimate - float(np.mean(estimate))
+    reference = reference - float(np.mean(reference))
+    scale = float(np.dot(estimate, reference)) / (float(np.dot(reference, reference)) + 1e-12)
+    projection = scale * reference
+    noise = estimate - projection
+    return float(
+        10.0
+        * np.log10(
+            (float(np.dot(projection, projection)) + 1e-12) / (float(np.dot(noise, noise)) + 1e-12)
+        )
+    )
+
+
+def _harmonic_bass(times: np.ndarray, f0: float = 82.0, harmonics: int = 8) -> np.ndarray:
+    """A bass note with harmonics that cross the separator's band boundaries."""
+    signal = np.zeros_like(times)
+    for k in range(1, harmonics + 1):
+        signal += (1.0 / k) * np.sin(2 * np.pi * f0 * k * times)
+    return signal / float(np.max(np.abs(signal)))
+
+
+def _config(duration_seconds: float) -> AudioSeparationConfig:
+    return AudioSeparationConfig(
+        target_sample_rate=_SR,
+        chunk_duration_seconds=duration_seconds,
+        max_duration_seconds=duration_seconds + 1.0,
+        max_file_bytes=8_000_000,
+    )
+
+
+def test_recovered_bass_is_not_high_fidelity_isolation(tmp_path) -> None:
+    """A real bass source is only partially recovered (band-split, not separation)."""
+    duration = 1.0
+    times = np.arange(int(_SR * duration), dtype=np.float32) / _SR
+    true_bass = _harmonic_bass(times)
+    true_vocal = np.sin(2 * np.pi * 440.0 * times) + 0.5 * np.sin(2 * np.pi * 880.0 * times)
+    true_vocal /= float(np.max(np.abs(true_vocal)))
+    mix = (0.6 * true_bass + 0.6 * true_vocal).astype(np.float32)
+
+    audio_path = tmp_path / "mix.wav"
+    sf.write(audio_path, mix, _SR)
+
+    result = AudioStemSeparator(_config(duration)).separate(audio_path)
+    bass_stem = result["stems"]["bass"]
+    length = min(bass_stem.size, true_bass.size)
+
+    si_sdr = _si_sdr(bass_stem[:length], (0.6 * true_bass[:length]).astype(np.float64))
+
+    # It keeps the in-band portion (positive) but is far from clean isolation.
+    # A neural separator would clear ~20 dB on a signal this simple; the band
+    # heuristic does not. Re-baseline this upper bound if a real model lands.
+    assert si_sdr > 0.0
+    assert si_sdr < 18.0
+
+
+def test_bass_source_energy_leaks_across_stems(tmp_path) -> None:
+    """Harmonics of a single source spread across stems, proving band-splitting."""
+    duration = 1.0
+    times = np.arange(int(_SR * duration), dtype=np.float32) / _SR
+    bass_only = (0.6 * _harmonic_bass(times)).astype(np.float32)
+
+    audio_path = tmp_path / "bass_only.wav"
+    sf.write(audio_path, bass_only, _SR)
+
+    stems = AudioStemSeparator(_config(duration)).separate(audio_path)["stems"]
+    energy = {name: float(np.sum(stem.astype(np.float64) ** 2)) for name, stem in stems.items()}
+    total = sum(energy.values()) + 1e-12
+    leaked_fraction = 1.0 - energy["bass"] / total
+
+    # A true source separator would keep ~all of a lone bass in the bass stem.
+    # This heuristic leaks a meaningful share into other stems (measured ~11%).
+    assert leaked_fraction >= 0.05

From 6a5889cc6bf1ab22df8fae74d9e9d8f7ce1e92ff Mon Sep 17 00:00:00 2001
From: Seongho Bae
Date: Sun, 5 Jul 2026 15:50:37 +0900
Subject: [PATCH 2/2] test(separation): add realistic-mix benchmark (negative
 mean SI-SDR)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On overlapping instruments (bass/keys/voice sharing bands) plus broadband
drums, the band-split heuristic scores a NEGATIVE mean SI-SDR — the
residual error carries more energy than the recovered source. Neural
separators report positive scores on real music (Demucs ~+9 dB,
Open-Unmix ~+5 dB SDR on MUSDB18). This pins that the current feature is
not source separation.

Each stem is scored against its reference over their common length, the
same way the bass-recovery test aligns them.

Co-Authored-By: Claude Opus 4.8
Claude-Session: https://claude.ai/code/session_01RjGVapDZ3k7V7zKYk16P4C
---
 .../tests/test_separation_quality.py          | 56 +++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/services/analysis-engine/tests/test_separation_quality.py b/services/analysis-engine/tests/test_separation_quality.py
index e18963fc..1bae48a3 100644
--- a/services/analysis-engine/tests/test_separation_quality.py
+++ b/services/analysis-engine/tests/test_separation_quality.py
@@ -99,3 +99,59 @@ def test_bass_source_energy_leaks_across_stems(tmp_path) -> None:
     # A true source separator would keep ~all of a lone bass in the bass stem.
     # This heuristic leaks a meaningful share into other stems (measured ~11%).
     assert leaked_fraction >= 0.05
+
+
+def test_realistic_mix_scores_far_below_usable_separation(tmp_path) -> None:
+    """On overlapping instruments + broadband drums, mean SI-SDR is negative.
+
+    A realistic band (bass/keys/voice whose harmonics share bands, plus
+    broadband drums) is where frequency masking breaks down. Neural
+    separators report positive scores on real music (Demucs ~+9 dB,
+    Open-Unmix ~+5 dB SDR on MUSDB18; a related but different metric and
+    dataset, quoted for scale only). This heuristic scores a *negative*
+    mean SI-SDR: a negative SI-SDR means the residual error carries more
+    energy than the recovered target. It is not by itself a comparison
+    against the unprocessed mixture, which would need the mixture's own
+    SI-SDR as a baseline. Re-baseline if a real model is introduced.
+    """
+    duration = 2.0
+    times = np.arange(int(_SR * duration), dtype=np.float32) / _SR
+    rng = np.random.default_rng(0)
+
+    def _norm(signal: np.ndarray) -> np.ndarray:
+        return signal / (float(np.max(np.abs(signal))) + 1e-12)
+
+    drums = np.zeros_like(times)
+    burst_len = int(0.12 * _SR)
+    for onset in np.arange(0.0, duration, 0.5):
+        start = int(onset * _SR)
+        burst = rng.standard_normal(burst_len) * np.exp(-np.linspace(0, 6, burst_len))
+        end = min(start + burst_len, drums.size)
+        drums[start:end] += burst[: end - start]
+
+    truth = {
+        "bass": _norm(_harmonic_bass(times, f0=55.0, harmonics=12)),
+        "other": _norm(_harmonic_bass(times, f0=220.0, harmonics=8)),
+        "vocals": _norm(_harmonic_bass(times, f0=261.0, harmonics=6)),
+        "drums": _norm(drums),
+    }
+    mix = np.sum([0.5 * source for source in truth.values()], axis=0).astype(np.float32)
+
+    audio_path = tmp_path / "band.wav"
+    sf.write(audio_path, mix, _SR)
+    stems = AudioStemSeparator(_config(duration)).separate(audio_path)["stems"]
+
+    scores = {}
+    for name, source in truth.items():
+        # Score over the common length, mirroring the bass-recovery test, so a
+        # stem shorter than the reference cannot trip a shape mismatch.
+        stem = stems[name]
+        length = min(stem.size, source.size)
+        scores[name] = _si_sdr(stem[:length], (0.5 * source[:length]).astype(np.float64))
+    mean_si_sdr = float(np.mean(list(scores.values())))
+
+    # Decisively below any usable separator (Demucs/Open-Unmix are positive).
+    assert mean_si_sdr < 0.0
+    # Overlapping instruments contaminate the vocal band: the residual error
+    # outweighs the recovered vocal energy (SI-SDR below 0 dB).
+    assert scores["vocals"] < 0.0