From 4cb6a464ffbb85af63731fdad0ebe2fdb68df2f1 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 6 Jun 2026 22:26:17 -0400 Subject: [PATCH 1/6] fix: drop failed LoRAs from engine cache signature to prevent mislabeled engines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On load_lora_weights failure the entry was silently dropped from the merge list but the original lora_dict remained, so get_engine_path() still built a cache path with the failed LoRA's signature — producing a no-LoRA engine permanently labelled as a LoRA engine (G1 cache-poisoning bug). Changes: - Track only successfully loaded adapters in _loaded_adapter_names dict - After merge, reassign lora_dict to only the fused entries (or None if all failed) so engine paths always reflect actual UNet weight content - Replace warn-and-continue on fuse_lora failure with RuntimeError: partial fusion leaves UNet weights ambiguous; a TRT engine built from that state is never correct and must not be silently cached --- src/streamdiffusion/wrapper.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py index 36abe02b..89cb5d4d 100644 --- a/src/streamdiffusion/wrapper.py +++ b/src/streamdiffusion/wrapper.py @@ -1379,6 +1379,8 @@ def _load_model( # Load and properly merge LoRA weights using the standard diffusers approach lora_adapters_to_merge = [] lora_scales_to_merge = [] + # adapter_name → (lora_name, lora_scale) for only successfully loaded adapters (G1 fix) + _loaded_adapter_names: dict = {} # Collect all LoRA adapters and their scales from lora_dict if lora_dict is not None: @@ -1391,10 +1393,11 @@ def _load_model( stream.pipe.load_lora_weights(lora_name, adapter_name=adapter_name) lora_adapters_to_merge.append(adapter_name) lora_scales_to_merge.append(lora_scale) + _loaded_adapter_names[adapter_name] = (lora_name, lora_scale) logger.info(f"Successfully loaded LoRA 
adapter: {adapter_name}") except Exception as e: logger.error(f"Failed to load LoRA {lora_name}: {e}") - # Continue with other LoRAs even if one fails + # Drop this entry — do NOT carry it into the engine cache key (G1 fix) continue # Merge all LoRA adapters using the proper diffusers method @@ -1408,15 +1411,26 @@ def _load_model( stream.pipe.unload_lora_weights() logger.info("Successfully merged LoRAs individually") - except Exception as fallback_error: - logger.error(f"LoRA merging fallback also failed: {fallback_error}") - logger.warning("Continuing without LoRA merging - LoRAs may not be applied correctly") - - # Clean up any partial state + except Exception as fuse_error: + # Partial fusion leaves UNet weights in an ambiguous state; baking a TRT engine + # from this state creates a permanently mislabeled or corrupted engine (G1 fix). try: stream.pipe.unload_lora_weights() except Exception: - pass + logger.debug("LoRA cleanup: unload_lora_weights() failed after merge failure", exc_info=True) + raise RuntimeError( + f"LoRA fusion failed — cannot build TRT engine with partial UNet state. Error: {fuse_error}" + ) from fuse_error + + # G1 fix: Correct lora_dict to only contain successfully fused LoRAs so that + # get_engine_path() computes the correct engine cache signature. Any LoRA that + # failed to load was never merged into UNet weights; the engine must NOT carry + # its signature in the cache path. 
+ if lora_dict is not None: + fused_lora_dict = { + lora_name: lora_scale for _adapter, (lora_name, lora_scale) in _loaded_adapter_names.items() + } + lora_dict = fused_lora_dict if fused_lora_dict else None if use_tiny_vae: if vae_id is not None: From 566bcee6d7aeb6127378728f6068a23cc27af06d Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 7 Jun 2026 06:13:19 -0400 Subject: [PATCH 2/6] fix: scope LoRA engine-cache suffix to UNet to avoid redundant VAE rebuilds --- .../acceleration/tensorrt/engine_manager.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/streamdiffusion/acceleration/tensorrt/engine_manager.py b/src/streamdiffusion/acceleration/tensorrt/engine_manager.py index ca01cc5c..da764d4d 100644 --- a/src/streamdiffusion/acceleration/tensorrt/engine_manager.py +++ b/src/streamdiffusion/acceleration/tensorrt/engine_manager.py @@ -153,8 +153,11 @@ def get_engine_path( if ipadapter_tokens is not None: prefix += f"--tokens{ipadapter_tokens}" - # Fused Loras - use concise hashed signature to avoid long/invalid paths - if lora_dict is not None and len(lora_dict) > 0: + # Fused Loras - use concise hashed signature to avoid long/invalid paths. + # Only UNet engines bake LoRA weights; VAE and other standard engines are + # LoRA-agnostic, so scoping the suffix to UNET prevents redundant VAE rebuilds + # every time the LoRA dict changes. 
+ if engine_type == EngineType.UNET and lora_dict is not None and len(lora_dict) > 0: prefix += f"--lora-{self._lora_signature(lora_dict)}" if engine_type == EngineType.UNET: From 41cd5c2d492e0b01cc5f7e24cfa2c9fa55a5b0ff Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 7 Jun 2026 07:47:33 -0400 Subject: [PATCH 3/6] fix: skip zero-scale LoRAs from fusion and engine cache signature --- src/streamdiffusion/wrapper.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py index 89cb5d4d..9c2e17ef 100644 --- a/src/streamdiffusion/wrapper.py +++ b/src/streamdiffusion/wrapper.py @@ -1388,6 +1388,20 @@ def _load_model( adapter_name = f"custom_lora_{i}" logger.info(f"_load_model: Loading LoRA '{lora_name}' with scale {lora_scale}") + # G8 fix: scale-0 fuse is a mathematical no-op (W + 0·ΔW = W), so skip + # loading and fusing entirely. The entry is also excluded from + # _loaded_adapter_names so the G1 block that runs after the merge step naturally + # drops it from the engine cache signature — a lora_dict with only + # zero-scale entries collapses to None and reuses the baseline UNet engine. + # Note: negative scales are valid (subtract the LoRA delta), so skip == 0 + # exactly, not <= 0. 
+ if lora_scale == 0: + logger.info( + f"_load_model: Skipping zero-scale LoRA '{lora_name}' — " + "no effect on weights; engine will match baseline cache" + ) + continue + try: # Load LoRA weights with unique adapter name stream.pipe.load_lora_weights(lora_name, adapter_name=adapter_name) From 4a6d2e5001612b4ba336472469885483aba4348f Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 7 Jun 2026 10:22:28 -0400 Subject: [PATCH 4/6] fix: route LoRA loading through offline-fallback helper (G4) --- src/streamdiffusion/wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py index 9c2e17ef..04c3c497 100644 --- a/src/streamdiffusion/wrapper.py +++ b/src/streamdiffusion/wrapper.py @@ -1404,7 +1404,7 @@ def _load_model( try: # Load LoRA weights with unique adapter name - stream.pipe.load_lora_weights(lora_name, adapter_name=adapter_name) + stream.load_lora(lora_name, adapter_name=adapter_name) lora_adapters_to_merge.append(adapter_name) lora_scales_to_merge.append(lora_scale) _loaded_adapter_names[adapter_name] = (lora_name, lora_scale) From 5c8e22ee355370e9d5e81b52603b51222e3160b0 Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 7 Jun 2026 06:15:50 -0400 Subject: [PATCH 5/6] perf: add per-engine VAE builder optimization level (default 3) --- src/streamdiffusion/config.py | 4 ++-- src/streamdiffusion/wrapper.py | 35 ++++++++++++++++++++++------------ 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/src/streamdiffusion/config.py b/src/streamdiffusion/config.py index 001fda3d..506ee864 100644 --- a/src/streamdiffusion/config.py +++ b/src/streamdiffusion/config.py @@ -130,9 +130,9 @@ def _extract_wrapper_params(config: Dict[str, Any]) -> Dict[str, Any]: 'static_shapes': config.get('static_shapes', False), 'fp8': config.get('fp8', False), 'builder_optimization_level': config.get('builder_optimization_level'), + 'vae_builder_optimization_level': 
config.get('vae_builder_optimization_level', 3), 'build_engines_if_missing': config.get('build_engines_if_missing', True), - 'fp8_allow_fp16_fallback': config.get('fp8_allow_fp16_fallback', False), - } + 'fp8_allow_fp16_fallback': config.get('fp8_allow_fp16_fallback', False), } if 'controlnets' in config and config['controlnets']: param_map['use_controlnet'] = True param_map['controlnet_config'] = _prepare_controlnet_configs(config) diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py index 04c3c497..9384eb25 100644 --- a/src/streamdiffusion/wrapper.py +++ b/src/streamdiffusion/wrapper.py @@ -131,6 +131,7 @@ def __init__( static_shapes: bool = False, fp8_allow_fp16_fallback: bool = False, builder_optimization_level: Optional[int] = None, + vae_builder_optimization_level: Optional[int] = None, ): """ Initializes the StreamDiffusionWrapper. @@ -328,6 +329,10 @@ def __init__( self.static_shapes = static_shapes self.fp8_allow_fp16_fallback = fp8_allow_fp16_fallback self.builder_optimization_level = builder_optimization_level + # Per-engine VAE optlvl (None → inherit builder_optimization_level). + # Tiny-VAE engines are small and gain little from optlvl 4 — defaulting to + # optlvl 3 via config.py shaves VAE encoder build time without affecting UNet quality. + self.vae_builder_optimization_level = vae_builder_optimization_level self.stream: StreamDiffusion = self._load_model( model_id_or_path=model_id_or_path, @@ -1596,6 +1601,12 @@ def _load_model( resolution=(self.height, self.width), builder_optimization_level=self.builder_optimization_level, ) + # Effective VAE optlvl: per-engine override first, then global fallback. 
+ _vae_optlvl = ( + self.vae_builder_optimization_level + if self.vae_builder_optimization_level is not None + else self.builder_optimization_level + ) vae_encoder_path = engine_manager.get_engine_path( EngineType.VAE_ENCODER, model_id_or_path=model_id_or_path, @@ -1608,7 +1619,7 @@ def _load_model( ipadapter_tokens=ipadapter_tokens, is_faceid=is_faceid if use_ipadapter_trt else None, resolution=(self.height, self.width), - builder_optimization_level=self.builder_optimization_level, + builder_optimization_level=_vae_optlvl, ) vae_decoder_path = engine_manager.get_engine_path( EngineType.VAE_DECODER, @@ -1622,7 +1633,7 @@ def _load_model( ipadapter_tokens=ipadapter_tokens, is_faceid=is_faceid if use_ipadapter_trt else None, resolution=(self.height, self.width), - builder_optimization_level=self.builder_optimization_level, + builder_optimization_level=_vae_optlvl, ) # Check if all required engines exist @@ -1851,6 +1862,14 @@ def _load_model( processors[name] = CachedSTAttnProcessor2_0() stream.unet.set_attn_processor(processors) + # Effective VAE optlvl for both decoder and encoder compile calls. + # Mirrors the _vae_optlvl computed for get_engine_path above. 
+ _vae_build_optlvl = ( + self.vae_builder_optimization_level + if self.vae_builder_optimization_level is not None + else self.builder_optimization_level + ) + # Compile VAE decoder engine using EngineManager vae_decoder_model = VAE( device=self.device, @@ -1877,11 +1896,7 @@ def _load_model( if not self.static_shapes else {} ), - **( - {"builder_optimization_level": self.builder_optimization_level} - if self.builder_optimization_level is not None - else {} - ), + **({"builder_optimization_level": _vae_build_optlvl} if _vae_build_optlvl is not None else {}), }, ) @@ -1911,11 +1926,7 @@ def _load_model( if not self.static_shapes else {} ), - **( - {"builder_optimization_level": self.builder_optimization_level} - if self.builder_optimization_level is not None - else {} - ), + **({"builder_optimization_level": _vae_build_optlvl} if _vae_build_optlvl is not None else {}), }, ) From bcfe20779c5bbd1af9697b4af1d85024eceba5b4 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 6 Jun 2026 22:26:28 -0400 Subject: [PATCH 6/6] test: add LoRA sanity script for headless baseline vs LoRA comparison Runs two StreamDiffusionWrapper passes (acceleration=none, same seed) and saves baseline.png, lora.png, and a side-by-side comparison PNG. Purpose: verify a LoRA loads on sdxl-turbo and is visibly effective at 2 denoising steps before paying the fp8 TRT engine build cost. Default: nerijs/pixel-art-xl @ 1.0 (SDXL, strong style at 2 steps) Usage: venv/Scripts/python scripts/test_lora_sanity.py venv/Scripts/python scripts/test_lora_sanity.py --lora not/a-real-lora --- scripts/test_lora_sanity.py | 211 ++++++++++++++++++++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100644 scripts/test_lora_sanity.py diff --git a/scripts/test_lora_sanity.py b/scripts/test_lora_sanity.py new file mode 100644 index 00000000..8d01ddeb --- /dev/null +++ b/scripts/test_lora_sanity.py @@ -0,0 +1,211 @@ +""" +LoRA Sanity Test — headless PyTorch baseline vs LoRA comparison. 
+ +Runs two StreamDiffusionWrapper passes (acceleration='none') from the same seed: + A) baseline — no LoRA + B) lora — with the requested LoRA fused at the given scale + +Saves baseline.png, lora.png, and a side-by-side comparison PNG. + +Purpose: confirm a LoRA loads correctly on sdxl-turbo and is visibly effective at +2 denoising steps BEFORE paying the expensive fp8 TRT engine build. + +Usage: + venv/Scripts/python scripts/test_lora_sanity.py + venv/Scripts/python scripts/test_lora_sanity.py --lora nerijs/pixel-art-xl --weight 1.0 + venv/Scripts/python scripts/test_lora_sanity.py --lora not/a-real-lora # G1 error path test +""" + +import argparse +import logging +import sys +from pathlib import Path + + +# --------------------------------------------------------------------------- +# Repo root on sys.path so `from streamdiffusion` works without install +# --------------------------------------------------------------------------- +_REPO_ROOT = Path(__file__).resolve().parents[1] +if str(_REPO_ROOT / "src") not in sys.path: + sys.path.insert(0, str(_REPO_ROOT / "src")) + +from PIL import Image # noqa: E402 + +from streamdiffusion import StreamDiffusionWrapper # noqa: E402 + + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", + datefmt="%H:%M:%S", +) +logger = logging.getLogger("lora_sanity") + +# --------------------------------------------------------------------------- +# Defaults +# --------------------------------------------------------------------------- +DEFAULT_MODEL = "stabilityai/sdxl-turbo" +DEFAULT_LORA = "nerijs/pixel-art-xl" +DEFAULT_WEIGHT = 1.0 +DEFAULT_PROMPT = "pixel art style, a beautiful mountain landscape at sunset, detailed" +DEFAULT_INPUT = str(_REPO_ROOT / "images" / "inputs" / "input.png") +DEFAULT_T_INDEX = [10, 35] +DEFAULT_SEED = 42 +DEFAULT_OUTPUT_DIR = str(_REPO_ROOT / "outputs" / "lora_sanity") + + +# --------------------------------------------------------------------------- 
+# Helper: run one inference pass with StreamDiffusionWrapper +# --------------------------------------------------------------------------- +def run_pass( + model_id: str, + prompt: str, + input_image: Image.Image, + t_index_list: list, + seed: int, + lora_dict: dict | None, + label: str, +) -> Image.Image: + logger.info(f"--- [{label}] Building wrapper (acceleration=none) ---") + if lora_dict: + logger.info(f" lora_dict = {lora_dict}") + + stream = StreamDiffusionWrapper( + model_id_or_path=model_id, + t_index_list=t_index_list, + frame_buffer_size=1, + width=512, + height=512, + warmup=1, + acceleration="none", + mode="img2img", + use_denoising_batch=True, + cfg_type="self", + seed=seed, + use_tiny_vae=False, + lora_dict=lora_dict, + ) + + stream.prepare( + prompt=prompt, + negative_prompt="", + num_inference_steps=50, + guidance_scale=1.0, + delta=1.0, + ) + + image_tensor = stream.preprocess_image(input_image) + + # Warmup: batch_size - 1 dummy passes (required by StreamDiffusion) + for _ in range(stream.batch_size - 1): + stream(image=image_tensor) + + output = stream(image=image_tensor) + logger.info(f" [{label}] Done — output type: {type(output)}") + return output + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- +def main() -> int: + parser = argparse.ArgumentParser(description="LoRA sanity: baseline vs LoRA comparison") + parser.add_argument("--model", default=DEFAULT_MODEL, help="HF model id or local path") + parser.add_argument("--lora", default=DEFAULT_LORA, help="HF repo id or local .safetensors path") + parser.add_argument("--weight", type=float, default=DEFAULT_WEIGHT, help="LoRA scale (0–1)") + parser.add_argument("--prompt", default=DEFAULT_PROMPT, help="Text prompt (include trigger words)") + parser.add_argument("--input", default=DEFAULT_INPUT, help="Input image path") + parser.add_argument( + "--t-index", + nargs="+", + 
type=int, + default=DEFAULT_T_INDEX, + metavar="T", + help="t_index_list (e.g. --t-index 10 35)", + ) + parser.add_argument("--seed", type=int, default=DEFAULT_SEED) + parser.add_argument("--output-dir", default=DEFAULT_OUTPUT_DIR, help="Directory for output PNGs") + args = parser.parse_args() + + # ------------------------------------------------------------------ + # Prepare output directory and input image + # ------------------------------------------------------------------ + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + input_path = Path(args.input) + if not input_path.exists(): + logger.error(f"Input image not found: {input_path}") + return 1 + + input_image = Image.open(input_path).convert("RGB").resize((512, 512)) + logger.info(f"Input image: {input_path} → resized to 512×512") + + lora_dict = {args.lora: args.weight} + t_index_list = args.t_index + + # ------------------------------------------------------------------ + # Run A: Baseline (no LoRA) + # ------------------------------------------------------------------ + logger.info("=" * 60) + logger.info("RUN A: baseline (no LoRA)") + logger.info("=" * 60) + baseline_img = run_pass( + model_id=args.model, + prompt=args.prompt, + input_image=input_image, + t_index_list=t_index_list, + seed=args.seed, + lora_dict=None, + label="baseline", + ) + baseline_path = output_dir / "baseline.png" + baseline_img.save(baseline_path) + logger.info(f"Saved baseline: {baseline_path}") + + # ------------------------------------------------------------------ + # Run B: LoRA + # ------------------------------------------------------------------ + logger.info("=" * 60) + logger.info(f"RUN B: LoRA={args.lora} @ {args.weight}") + logger.info("=" * 60) + try: + lora_img = run_pass( + model_id=args.model, + prompt=args.prompt, + input_image=input_image, + t_index_list=t_index_list, + seed=args.seed, + lora_dict=lora_dict, + label="lora", + ) + except RuntimeError as e: + 
logger.error(f"LoRA run failed (expected for invalid LoRA ids): {e}") + logger.info("Baseline image saved. LoRA run aborted cleanly (G1 fix working correctly).") + return 2 + + lora_path = output_dir / "lora.png" + lora_img.save(lora_path) + logger.info(f"Saved lora: {lora_path}") + + # ------------------------------------------------------------------ + # Side-by-side comparison + # ------------------------------------------------------------------ + comparison = Image.new("RGB", (1024, 512)) + comparison.paste(baseline_img.resize((512, 512)), (0, 0)) + comparison.paste(lora_img.resize((512, 512)), (512, 0)) + comparison_path = output_dir / "comparison.png" + comparison.save(comparison_path) + logger.info(f"Saved side-by-side: {comparison_path}") + + logger.info("=" * 60) + logger.info("DONE. Inspect outputs:") + logger.info(f" baseline: {baseline_path}") + logger.info(f" lora: {lora_path}") + logger.info(f" comparison: {comparison_path}") + logger.info("=" * 60) + return 0 + + +if __name__ == "__main__": + sys.exit(main())