dotsimulate · forkni · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
diff --git a/configs/td_config.yaml.example b/configs/td_config.yaml.example
@@ -78,6 +78,12 @@ engine_dir: "engines/td"
 
 # ControlNet configuration (disabled)
 use_controlnet: false
+# cn_cache_interval: recompute ControlNet (CN) residuals only every N frames, reusing the cached ones in between.
+# 1 = disabled (default, always recompute). 2+ = skip the CN forward pass on intermediate frames.
+# Safe to change live; invalidated automatically on control-image or scale change.
+# Note: the cache key does NOT include t_index_list, so avoid changing the batch config
+# mid-stream while caching is active. The practical risk is low; noted for completeness.
+cn_cache_interval: 1
 
 # IPAdapter configuration (disabled)
 use_ipadapter: false

diff --git a/demo/realtime-img2img/controlnet_registry.yaml b/demo/realtime-img2img/controlnet_registry.yaml
@@ -55,7 +55,7 @@ available_controlnets:
     - id: "tile_sd15"
       name: "Tile/Feedback"
       model_id: "lllyasviel/control_v11f1e_sd15_tile"
-      default_preprocessor: "feedback"
+      default_preprocessor: "passthrough"
       default_scale: 0.6
       description: "Uses image feedback for enhanced details"
       preprocessor_params:
@@ -116,8 +116,27 @@ available_controlnets:
     - id: "tile_sdxl"
       name: "Tile/Feedback"
       model_id: "xinsir/controlnet-tile-sdxl-1.0"
-      default_preprocessor: "feedback" 
+      default_preprocessor: "passthrough"
       default_scale: 0.6
       description: "Uses image feedback for enhanced details (SDXL)"
+      preprocessor_params:
+        image_resolution: 512
+
+    - id: "depth_xinsir_sdxl"
+      name: "Depth Detection (xinsir)"
+      model_id: "xinsir/controlnet-depth-sdxl-1.0"
+      default_preprocessor: "depth_tensorrt"
+      default_scale: 0.8
+      description: "Estimates depth information from images — xinsir SDXL variant"
+      preprocessor_params:
+        detect_resolution: 518
+        image_resolution: 512
+
+    - id: "scribble_sdxl"
+      name: "Scribble"
+      model_id: "xinsir/controlnet-scribble-sdxl-1.0"
+      default_preprocessor: "scribble_tensorrt"
+      default_scale: 0.8
+      description: "Produces sketch-like scribble edge conditioning (SDXL)"
       preprocessor_params:
         image_resolution: 512
diff --git a/demo/realtime-img2img/util.py b/demo/realtime-img2img/util.py
@@ -29,21 +29,31 @@ def bytes_to_pil(image_bytes: bytes) -> Image.Image:
 
 def bytes_to_pt(image_bytes: bytes) -> torch.Tensor:
     """
-    Convert JPEG/PNG bytes directly to PyTorch tensor using torchvision
-
+    Convert JPEG bytes directly to a GPU float32 tensor via torchvision nvJPEG.
+
+    Decodes on CUDA when available (nvJPEG path), eliminating the CPU decode +
+    host→device DMA transfer that the CPU path incurs.  Falls back to CPU decode
+    on machines without CUDA.
+
     Args:
-        image_bytes: Raw image bytes (JPEG/PNG format)
-
+        image_bytes: Raw JPEG bytes.  NOTE(review): there is no PNG fallback here;
+                     decode_jpeg is called on every input, so confirm PNG handling.
+
+
     Returns:
-        torch.Tensor: Image tensor with shape (C, H, W), values in [0, 1], dtype float32
+        torch.Tensor: Image tensor with shape (C, H, W), values in [0, 1],
+                      dtype float32, on the same device as the decode.
     """
-    # Convert bytes to tensor for torchvision
     byte_tensor = torch.frombuffer(image_bytes, dtype=torch.uint8)
-
-    # Decode JPEG/PNG directly to tensor (C, H, W) format, uint8 [0, 255]
-    image_tensor = decode_jpeg(byte_tensor)
-
-    # Convert to float32 and normalize to [0, 1]
+
+    # Decode directly on GPU when CUDA is available — nvJPEG avoids the
+    # CPU decode + H2D copy incurred by the plain decode_jpeg(byte_tensor) call.
+    if torch.cuda.is_available():
+        image_tensor = decode_jpeg(byte_tensor, device="cuda")
+    else:
+        image_tensor = decode_jpeg(byte_tensor)
+
+    # Normalise to [0, 1] on the decode device (no extra host/device transfer).
     image_tensor = image_tensor.float() / 255.0
 
     return image_tensor