From 4cb6a464ffbb85af63731fdad0ebe2fdb68df2f1 Mon Sep 17 00:00:00 2001
From: Alex <forkni@gmail.com>
Date: Sat, 6 Jun 2026 22:26:17 -0400
Subject: [PATCH 1/6] fix: drop failed LoRAs from engine cache signature to
 prevent mislabeled engines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On load_lora_weights failure the entry was silently dropped from the merge
list but the original lora_dict remained, so get_engine_path() still built
a cache path with the failed LoRA's signature — producing a no-LoRA engine
permanently labelled as a LoRA engine (G1 cache-poisoning bug).

Changes:
- Track only successfully loaded adapters in _loaded_adapter_names dict
- After merge, reassign lora_dict to only the fused entries (or None if all
  failed) so engine paths always reflect actual UNet weight content
- Replace warn-and-continue on fuse_lora failure with RuntimeError: partial
  fusion leaves UNet weights ambiguous; a TRT engine built from that state
  is never correct and must not be silently cached
---
 src/streamdiffusion/wrapper.py | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py
index 36abe02b..89cb5d4d 100644
--- a/src/streamdiffusion/wrapper.py
+++ b/src/streamdiffusion/wrapper.py
@@ -1379,6 +1379,8 @@ def _load_model(
         # Load and properly merge LoRA weights using the standard diffusers approach
         lora_adapters_to_merge = []
         lora_scales_to_merge = []
+        # adapter_name → (lora_name, lora_scale) for only successfully loaded adapters (G1 fix)
+        _loaded_adapter_names: dict = {}
 
         # Collect all LoRA adapters and their scales from lora_dict
         if lora_dict is not None:
@@ -1391,10 +1393,11 @@ def _load_model(
                     stream.pipe.load_lora_weights(lora_name, adapter_name=adapter_name)
                     lora_adapters_to_merge.append(adapter_name)
                     lora_scales_to_merge.append(lora_scale)
+                    _loaded_adapter_names[adapter_name] = (lora_name, lora_scale)
                     logger.info(f"Successfully loaded LoRA adapter: {adapter_name}")
                 except Exception as e:
                     logger.error(f"Failed to load LoRA {lora_name}: {e}")
-                    # Continue with other LoRAs even if one fails
+                    # Drop this entry — do NOT carry it into the engine cache key (G1 fix)
                     continue
 
         # Merge all LoRA adapters using the proper diffusers method
@@ -1408,15 +1411,26 @@ def _load_model(
                 stream.pipe.unload_lora_weights()
                 logger.info("Successfully merged LoRAs individually")
 
-            except Exception as fallback_error:
-                logger.error(f"LoRA merging fallback also failed: {fallback_error}")
-                logger.warning("Continuing without LoRA merging - LoRAs may not be applied correctly")
-
-                # Clean up any partial state
+            except Exception as fuse_error:
+                # Partial fusion leaves UNet weights in an ambiguous state; baking a TRT engine
+                # from this state creates a permanently mislabeled or corrupted engine (G1 fix).
                 try:
                     stream.pipe.unload_lora_weights()
                 except Exception:
-                    pass
+                    logger.debug("LoRA cleanup: unload_lora_weights() failed after merge failure", exc_info=True)
+                raise RuntimeError(
+                    f"LoRA fusion failed — cannot build TRT engine with partial UNet state. Error: {fuse_error}"
+                ) from fuse_error
+
+        # G1 fix: Correct lora_dict to only contain successfully fused LoRAs so that
+        # get_engine_path() computes the correct engine cache signature.  Any LoRA that
+        # failed to load was never merged into UNet weights; the engine must NOT carry
+        # its signature in the cache path.
+        if lora_dict is not None:
+            fused_lora_dict = {
+                lora_name: lora_scale for _adapter, (lora_name, lora_scale) in _loaded_adapter_names.items()
+            }
+            lora_dict = fused_lora_dict if fused_lora_dict else None
 
         if use_tiny_vae:
             if vae_id is not None:

From 566bcee6d7aeb6127378728f6068a23cc27af06d Mon Sep 17 00:00:00 2001
From: Alex <forkni@gmail.com>
Date: Sun, 7 Jun 2026 06:13:19 -0400
Subject: [PATCH 2/6] fix: scope LoRA engine-cache suffix to UNet to avoid
 redundant VAE rebuilds

---
 .../acceleration/tensorrt/engine_manager.py                | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/streamdiffusion/acceleration/tensorrt/engine_manager.py b/src/streamdiffusion/acceleration/tensorrt/engine_manager.py
index ca01cc5c..da764d4d 100644
--- a/src/streamdiffusion/acceleration/tensorrt/engine_manager.py
+++ b/src/streamdiffusion/acceleration/tensorrt/engine_manager.py
@@ -153,8 +153,11 @@ def get_engine_path(
             if ipadapter_tokens is not None:
                 prefix += f"--tokens{ipadapter_tokens}"
 
-            # Fused Loras - use concise hashed signature to avoid long/invalid paths
-            if lora_dict is not None and len(lora_dict) > 0:
+            # Fused Loras - use concise hashed signature to avoid long/invalid paths.
+            # Only UNet engines bake LoRA weights; VAE and other standard engines are
+            # LoRA-agnostic, so scoping the suffix to UNET prevents redundant VAE rebuilds
+            # every time the LoRA dict changes.
+            if engine_type == EngineType.UNET and lora_dict is not None and len(lora_dict) > 0:
                 prefix += f"--lora-{self._lora_signature(lora_dict)}"
 
             if engine_type == EngineType.UNET:

From 41cd5c2d492e0b01cc5f7e24cfa2c9fa55a5b0ff Mon Sep 17 00:00:00 2001
From: Alex <forkni@gmail.com>
Date: Sun, 7 Jun 2026 07:47:33 -0400
Subject: [PATCH 3/6] fix: skip zero-scale LoRAs from fusion and engine cache
 signature

---
 src/streamdiffusion/wrapper.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py
index 89cb5d4d..9c2e17ef 100644
--- a/src/streamdiffusion/wrapper.py
+++ b/src/streamdiffusion/wrapper.py
@@ -1388,6 +1388,20 @@ def _load_model(
                 adapter_name = f"custom_lora_{i}"
                 logger.info(f"_load_model: Loading LoRA '{lora_name}' with scale {lora_scale}")
 
+                # G8 fix: scale-0 fuse is a mathematical no-op (W + 0·ΔW = W), so skip
+                # loading and fusing entirely.  The entry is also excluded from
+                # _loaded_adapter_names so the G1 block after the merge step naturally
+                # drops it from the engine cache signature — a lora_dict with only
+                # zero-scale entries collapses to None and reuses the baseline UNet engine.
+                # Note: negative scales are valid (subtract the LoRA delta), so skip == 0
+                # exactly, not <= 0.
+                if lora_scale == 0:
+                    logger.info(
+                        f"_load_model: Skipping zero-scale LoRA '{lora_name}' — "
+                        "no effect on weights; engine will match baseline cache"
+                    )
+                    continue
+
                 try:
                     # Load LoRA weights with unique adapter name
                     stream.pipe.load_lora_weights(lora_name, adapter_name=adapter_name)

From 4a6d2e5001612b4ba336472469885483aba4348f Mon Sep 17 00:00:00 2001
From: Alex <forkni@gmail.com>
Date: Sun, 7 Jun 2026 10:22:28 -0400
Subject: [PATCH 4/6] fix: route LoRA loading through offline-fallback helper
 (G4)

---
 src/streamdiffusion/wrapper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py
index 9c2e17ef..04c3c497 100644
--- a/src/streamdiffusion/wrapper.py
+++ b/src/streamdiffusion/wrapper.py
@@ -1404,7 +1404,7 @@ def _load_model(
 
                 try:
                     # Load LoRA weights with unique adapter name
-                    stream.pipe.load_lora_weights(lora_name, adapter_name=adapter_name)
+                    stream.load_lora(lora_name, adapter_name=adapter_name)
                     lora_adapters_to_merge.append(adapter_name)
                     lora_scales_to_merge.append(lora_scale)
                     _loaded_adapter_names[adapter_name] = (lora_name, lora_scale)

From 5c8e22ee355370e9d5e81b52603b51222e3160b0 Mon Sep 17 00:00:00 2001
From: Alex <forkni@gmail.com>
Date: Sun, 7 Jun 2026 06:15:50 -0400
Subject: [PATCH 5/6] perf: add per-engine VAE builder optimization level
 (default 3)

---
 src/streamdiffusion/config.py  |  2 ++
 src/streamdiffusion/wrapper.py | 35 ++++++++++++++++++++++------------
 2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/src/streamdiffusion/config.py b/src/streamdiffusion/config.py
index 001fda3d..506ee864 100644
--- a/src/streamdiffusion/config.py
+++ b/src/streamdiffusion/config.py
@@ -130,9 +130,11 @@ def _extract_wrapper_params(config: Dict[str, Any]) -> Dict[str, Any]:
         'static_shapes': config.get('static_shapes', False),
         'fp8': config.get('fp8', False),
         'builder_optimization_level': config.get('builder_optimization_level'),
+        # Missing key → 3 (VAE-specific default); an explicit None inherits builder_optimization_level.
+        'vae_builder_optimization_level': config.get('vae_builder_optimization_level', 3),
         'build_engines_if_missing': config.get('build_engines_if_missing', True),
         'fp8_allow_fp16_fallback': config.get('fp8_allow_fp16_fallback', False),
     }
     if 'controlnets' in config and config['controlnets']:
         param_map['use_controlnet'] = True
         param_map['controlnet_config'] = _prepare_controlnet_configs(config)
diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py
index 04c3c497..9384eb25 100644
--- a/src/streamdiffusion/wrapper.py
+++ b/src/streamdiffusion/wrapper.py
@@ -131,6 +131,7 @@ def __init__(
         static_shapes: bool = False,
         fp8_allow_fp16_fallback: bool = False,
         builder_optimization_level: Optional[int] = None,
+        vae_builder_optimization_level: Optional[int] = None,
     ):
         """
         Initializes the StreamDiffusionWrapper.
@@ -328,6 +329,10 @@ def __init__(
         self.static_shapes = static_shapes
         self.fp8_allow_fp16_fallback = fp8_allow_fp16_fallback
         self.builder_optimization_level = builder_optimization_level
+        # Per-engine VAE optlvl (None → inherit builder_optimization_level).
+        # Tiny-VAE engines are small and gain little from optlvl 4 — defaulting to
+        # optlvl 3 via config.py shaves VAE encoder/decoder build time without affecting the UNet engine.
+        self.vae_builder_optimization_level = vae_builder_optimization_level
 
         self.stream: StreamDiffusion = self._load_model(
             model_id_or_path=model_id_or_path,
@@ -1596,6 +1601,12 @@ def _load_model(
                     resolution=(self.height, self.width),
                     builder_optimization_level=self.builder_optimization_level,
                 )
+                # Effective VAE optlvl: per-engine override first, then global fallback.
+                _vae_optlvl = (
+                    self.vae_builder_optimization_level
+                    if self.vae_builder_optimization_level is not None
+                    else self.builder_optimization_level
+                )
                 vae_encoder_path = engine_manager.get_engine_path(
                     EngineType.VAE_ENCODER,
                     model_id_or_path=model_id_or_path,
@@ -1608,7 +1619,7 @@ def _load_model(
                     ipadapter_tokens=ipadapter_tokens,
                     is_faceid=is_faceid if use_ipadapter_trt else None,
                     resolution=(self.height, self.width),
-                    builder_optimization_level=self.builder_optimization_level,
+                    builder_optimization_level=_vae_optlvl,
                 )
                 vae_decoder_path = engine_manager.get_engine_path(
                     EngineType.VAE_DECODER,
@@ -1622,7 +1633,7 @@ def _load_model(
                     ipadapter_tokens=ipadapter_tokens,
                     is_faceid=is_faceid if use_ipadapter_trt else None,
                     resolution=(self.height, self.width),
-                    builder_optimization_level=self.builder_optimization_level,
+                    builder_optimization_level=_vae_optlvl,
                 )
 
                 # Check if all required engines exist
@@ -1851,6 +1862,14 @@ def _load_model(
                             processors[name] = CachedSTAttnProcessor2_0()
                     stream.unet.set_attn_processor(processors)
 
+                # Effective VAE optlvl for both decoder and encoder compile calls.
+                # Mirrors the _vae_optlvl computed for get_engine_path above.
+                _vae_build_optlvl = (
+                    self.vae_builder_optimization_level
+                    if self.vae_builder_optimization_level is not None
+                    else self.builder_optimization_level
+                )
+
                 # Compile VAE decoder engine using EngineManager
                 vae_decoder_model = VAE(
                     device=self.device,
@@ -1877,11 +1896,7 @@ def _load_model(
                             if not self.static_shapes
                             else {}
                         ),
-                        **(
-                            {"builder_optimization_level": self.builder_optimization_level}
-                            if self.builder_optimization_level is not None
-                            else {}
-                        ),
+                        **({"builder_optimization_level": _vae_build_optlvl} if _vae_build_optlvl is not None else {}),
                     },
                 )
 
@@ -1911,11 +1926,7 @@ def _load_model(
                             if not self.static_shapes
                             else {}
                         ),
-                        **(
-                            {"builder_optimization_level": self.builder_optimization_level}
-                            if self.builder_optimization_level is not None
-                            else {}
-                        ),
+                        **({"builder_optimization_level": _vae_build_optlvl} if _vae_build_optlvl is not None else {}),
                     },
                 )
 

From bcfe20779c5bbd1af9697b4af1d85024eceba5b4 Mon Sep 17 00:00:00 2001
From: Alex <forkni@gmail.com>
Date: Sat, 6 Jun 2026 22:26:28 -0400
Subject: [PATCH 6/6] test: add LoRA sanity script for headless baseline vs
 LoRA comparison

Runs two StreamDiffusionWrapper passes (acceleration=none, same seed) and
saves baseline.png, lora.png, and a side-by-side comparison PNG.

Purpose: verify a LoRA loads on sdxl-turbo and is visibly effective at
2 denoising steps before paying the fp8 TRT engine build cost.

Default: nerijs/pixel-art-xl @ 1.0 (SDXL, strong style at 2 steps)
Usage:
  venv/Scripts/python scripts/test_lora_sanity.py
  venv/Scripts/python scripts/test_lora_sanity.py --lora not/a-real-lora
---
 scripts/test_lora_sanity.py | 211 ++++++++++++++++++++++++++++++++++++
 1 file changed, 211 insertions(+)
 create mode 100644 scripts/test_lora_sanity.py

diff --git a/scripts/test_lora_sanity.py b/scripts/test_lora_sanity.py
new file mode 100644
index 00000000..8d01ddeb
--- /dev/null
+++ b/scripts/test_lora_sanity.py
@@ -0,0 +1,211 @@
+"""
+LoRA Sanity Test — headless PyTorch baseline vs LoRA comparison.
+
+Runs two StreamDiffusionWrapper passes (acceleration='none') from the same seed:
+  A) baseline — no LoRA
+  B) lora     — with the requested LoRA fused at the given scale
+
+Saves baseline.png, lora.png, and a side-by-side comparison PNG.
+
+Purpose: confirm a LoRA loads correctly on sdxl-turbo and is visibly effective at
+2 denoising steps BEFORE paying the expensive fp8 TRT engine build.
+
+Usage:
+    venv/Scripts/python scripts/test_lora_sanity.py
+    venv/Scripts/python scripts/test_lora_sanity.py --lora nerijs/pixel-art-xl --weight 1.0
+    venv/Scripts/python scripts/test_lora_sanity.py --lora not/a-real-lora  # G1 error path test
+"""
+
+import argparse
+import logging
+import sys
+from pathlib import Path
+
+
+# ---------------------------------------------------------------------------
+# Put <repo>/src on sys.path so `from streamdiffusion` works without install
+# ---------------------------------------------------------------------------
+_REPO_ROOT = Path(__file__).resolve().parents[1]  # scripts/ is one level below the repo root
+if str(_REPO_ROOT / "src") not in sys.path:
+    sys.path.insert(0, str(_REPO_ROOT / "src"))
+
+from PIL import Image  # noqa: E402
+
+from streamdiffusion import StreamDiffusionWrapper  # noqa: E402
+
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+    datefmt="%H:%M:%S",
+)
+logger = logging.getLogger("lora_sanity")
+
+# ---------------------------------------------------------------------------
+# Defaults for the command-line flags defined in main()
+# ---------------------------------------------------------------------------
+DEFAULT_MODEL = "stabilityai/sdxl-turbo"
+DEFAULT_LORA = "nerijs/pixel-art-xl"
+DEFAULT_WEIGHT = 1.0
+DEFAULT_PROMPT = "pixel art style, a beautiful mountain landscape at sunset, detailed"
+DEFAULT_INPUT = str(_REPO_ROOT / "images" / "inputs" / "input.png")
+DEFAULT_T_INDEX = [10, 35]  # passed to the wrapper as t_index_list
+DEFAULT_SEED = 42  # the same seed is used for the baseline and the LoRA pass
+DEFAULT_OUTPUT_DIR = str(_REPO_ROOT / "outputs" / "lora_sanity")
+
+
+# ---------------------------------------------------------------------------
+# Helper: run one inference pass with StreamDiffusionWrapper
+# ---------------------------------------------------------------------------
+def run_pass(
+    model_id: str,
+    prompt: str,
+    input_image: Image.Image,
+    t_index_list: list,
+    seed: int,
+    lora_dict: dict | None,
+    label: str,
+) -> Image.Image:
+    """Run a single img2img pass through a freshly built wrapper.
+
+    Builds a StreamDiffusionWrapper with acceleration="none" at 512x512, prepares
+    it with ``prompt``, feeds ``input_image`` through the warmup passes and returns
+    the output of the final call. ``lora_dict`` is forwarded to the wrapper
+    unchanged; ``label`` only tags the log lines.
+    """
+    logger.info(f"--- [{label}] Building wrapper (acceleration=none) ---")
+    if lora_dict:
+        logger.info(f"    lora_dict = {lora_dict}")
+
+    # Fixed headless settings: plain PyTorch path, 512x512 img2img, tiny VAE disabled.
+    wrapper = StreamDiffusionWrapper(
+        model_id_or_path=model_id,
+        lora_dict=lora_dict,
+        t_index_list=t_index_list,
+        seed=seed,
+        mode="img2img",
+        acceleration="none",
+        width=512,
+        height=512,
+        frame_buffer_size=1,
+        warmup=1,
+        use_denoising_batch=True,
+        cfg_type="self",
+        use_tiny_vae=False,
+    )
+    wrapper.prepare(prompt=prompt, negative_prompt="", num_inference_steps=50, guidance_scale=1.0, delta=1.0)
+    frame = wrapper.preprocess_image(input_image)
+
+    # StreamDiffusion requires batch_size - 1 throwaway passes before the real one.
+    for _ in range(wrapper.batch_size - 1):
+        wrapper(image=frame)
+
+    result = wrapper(image=frame)
+    logger.info(f"    [{label}] Done — output type: {type(result)}")
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+def main() -> int:
+    """Run the baseline and LoRA passes and write the PNGs into --output-dir.
+
+    Returns:
+        0 on success, 1 if the input image is missing, 2 if the LoRA pass raised
+        RuntimeError (baseline.png has already been written at that point).
+    """
+    parser = argparse.ArgumentParser(description="LoRA sanity: baseline vs LoRA comparison")
+    parser.add_argument("--model", default=DEFAULT_MODEL, help="HF model id or local path")
+    parser.add_argument("--lora", default=DEFAULT_LORA, help="HF repo id or local .safetensors path")
+    parser.add_argument("--weight", type=float, default=DEFAULT_WEIGHT, help="LoRA scale (0 = skip, negative allowed)")
+    parser.add_argument("--prompt", default=DEFAULT_PROMPT, help="Text prompt (include trigger words)")
+    parser.add_argument("--input", default=DEFAULT_INPUT, help="Input image path")
+    parser.add_argument(
+        "--t-index",
+        nargs="+",
+        type=int,
+        default=DEFAULT_T_INDEX,
+        metavar="T",
+        help="t_index_list (e.g. --t-index 10 35)",
+    )
+    parser.add_argument("--seed", type=int, default=DEFAULT_SEED)
+    parser.add_argument("--output-dir", default=DEFAULT_OUTPUT_DIR, help="Directory for output PNGs")
+    args = parser.parse_args()
+
+    # Prepare output directory and input image.
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    input_path = Path(args.input)
+    if not input_path.exists():
+        logger.error(f"Input image not found: {input_path}")
+        return 1
+
+    input_image = Image.open(input_path).convert("RGB").resize((512, 512))
+    logger.info(f"Input image: {input_path} → resized to 512×512")
+
+    lora_dict = {args.lora: args.weight}
+
+    # Run A: baseline (no LoRA).
+    logger.info("=" * 60)
+    logger.info("RUN A: baseline (no LoRA)")
+    logger.info("=" * 60)
+    baseline_img = run_pass(
+        model_id=args.model,
+        prompt=args.prompt,
+        input_image=input_image,
+        t_index_list=args.t_index,
+        seed=args.seed,
+        lora_dict=None,
+        label="baseline",
+    )
+    baseline_path = output_dir / "baseline.png"
+    baseline_img.save(baseline_path)
+    logger.info(f"Saved baseline: {baseline_path}")
+
+    # Run B: LoRA.
+    logger.info("=" * 60)
+    logger.info(f"RUN B: LoRA={args.lora} @ {args.weight}")
+    logger.info("=" * 60)
+    try:
+        lora_img = run_pass(
+            model_id=args.model,
+            prompt=args.prompt,
+            input_image=input_image,
+            t_index_list=args.t_index,
+            seed=args.seed,
+            lora_dict=lora_dict,
+            label="lora",
+        )
+    except RuntimeError:
+        # RuntimeError is not specific to LoRA fusion failures, so report it neutrally with the traceback.
+        logger.exception("LoRA run failed; baseline.png was saved, no LoRA or comparison image written")
+        return 2
+
+    lora_path = output_dir / "lora.png"
+    lora_img.save(lora_path)
+    logger.info(f"Saved lora: {lora_path}")
+    # A LoRA the wrapper drops (failed load, zero scale) does not raise; flag the no-op result.
+    if lora_img.size == baseline_img.size and lora_img.tobytes() == baseline_img.tobytes():
+        logger.warning("LoRA output is identical to baseline; the LoRA was probably not applied (see log)")
+
+    # Side-by-side comparison.
+    comparison = Image.new("RGB", (1024, 512))
+    comparison.paste(baseline_img.resize((512, 512)), (0, 0))
+    comparison.paste(lora_img.resize((512, 512)), (512, 0))
+    comparison_path = output_dir / "comparison.png"
+    comparison.save(comparison_path)
+    logger.info(f"Saved side-by-side: {comparison_path}")
+
+    logger.info("=" * 60)
+    logger.info("DONE. Inspect outputs:")
+    logger.info(f"  baseline:   {baseline_path}")
+    logger.info(f"  lora:       {lora_path}")
+    logger.info(f"  comparison: {comparison_path}")
+    logger.info("=" * 60)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())