Merged
21 commits
3001123
feat: add cuBLAS dynamic loader and C++ kernel profiler (#134, #150)
m96-chan Dec 31, 2025
7d7de55
feat(llm): add lazy model loading with streaming strategies (#159)
m96-chan Dec 31, 2025
aa8fd7e
fix(lint): resolve ruff B027 and UP037 errors in streaming.py
m96-chan Dec 31, 2025
c82155a
style: apply ruff format to streaming.py
m96-chan Dec 31, 2025
a469a97
fix(lint): resolve ruff errors in profiling module
m96-chan Dec 31, 2025
204a38b
fix(tests): add skip markers for profiling tests requiring CUDA
m96-chan Dec 31, 2025
521f908
feat(diffusion): add image generation module for SD3, Flux, PixArt (#…
m96-chan Jan 1, 2026
905c181
fix(diffusion): resolve mypy type errors in text encoders
m96-chan Jan 1, 2026
fb6b619
feat(diffusion): add native CUDA kernels for image generation ops
m96-chan Jan 1, 2026
8a0c247
fix(diffusion): fix PixArt-Sigma model loading and inference
m96-chan Jan 1, 2026
ffeb2f8
feat(diffusion): add HuggingFace T5 encoder with sharded safetensors …
m96-chan Jan 1, 2026
865619c
feat(diffusion): add batched_matmul loop fallback for SM120
m96-chan Jan 1, 2026
dc0de77
feat(diffusion): add FLUX.1 transformer implementation
m96-chan Jan 1, 2026
43e5756
fix(lint): remove unused variables in DiT and FLUX models
m96-chan Jan 1, 2026
5006afa
fix(cmake): remove orphaned #endif in diffusion kernels
m96-chan Jan 1, 2026
aa61015
fix(cmake): use nbytes() instead of size_bytes() in diffusion.inl
m96-chan Jan 1, 2026
5a8a98c
fix(cmake): use device_memset wrapper instead of cudaMemset
m96-chan Jan 1, 2026
502fe48
docs: update README for v0.2.19
m96-chan Jan 1, 2026
28c5ab7
docs: expand v0.2.19 release notes
m96-chan Jan 1, 2026
4df5b09
chore: bump version to 0.2.19
m96-chan Jan 1, 2026
ba68c8a
Merge main into feature/v0.2.19
m96-chan Jan 1, 2026
109 changes: 109 additions & 0 deletions README.md
@@ -99,6 +99,114 @@ They were all observed in production or real benchmarks.

---

## What's New in v0.2.19

### FLUX.1 Image Generation
Text-to-image generation with Black Forest Labs' FLUX.1 model:

```python
from pygpukit.diffusion import FluxPipeline

# Load FLUX.1-schnell (fast variant)
pipeline = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell")

# Generate image
image = pipeline.generate(
    prompt="a photo of a cat sitting on a windowsill",
    num_inference_steps=4,  # schnell needs only a few steps
    guidance_scale=0.0,  # schnell doesn't use CFG
)
image.save("output.png")
```

| Component | Description |
|-----------|-------------|
| **FluxTransformer** | 19 joint blocks + 38 single blocks |
| **FluxScheduler** | Flow matching Euler scheduler (step rule sketched below) |
| **GPU-native ops** | Transpose, batched matmul, RoPE on GPU |
| **RoPE frequencies** | Cached on GPU for efficient reuse |
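
The scheduler's update rule is compact enough to show inline. A minimal sketch of one flow-matching Euler step (the general rule, with illustrative variable names, not FluxScheduler's exact code):

```python
# One flow-matching Euler step: move the latents along the predicted
# velocity field from noise (sigma = 1) toward data (sigma = 0).
def euler_step(latents, velocity, sigma: float, sigma_next: float):
    return latents + (sigma_next - sigma) * velocity
```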

### Lazy Model Loading with Streaming
Memory-efficient model loading strategies for large models:

```python
from pygpukit.llm import QwenModel, StreamingStrategy

# Progressive loading - load layers as needed
model = QwenModel.from_safetensors(
    "path/to/model",
    streaming=StreamingStrategy.PROGRESSIVE,
)

# Layer-by-layer streaming for memory-constrained environments
model = QwenModel.from_safetensors(
    "path/to/model",
    streaming=StreamingStrategy.LAYER_BY_LAYER,
)
```

| Strategy | Description |
|----------|-------------|
| `EAGER` | Load all weights at once (default) |
| `PROGRESSIVE` | Load weights progressively during the first forward pass |
| `LAYER_BY_LAYER` | Stream one layer at a time for minimal memory (sketch below) |
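
A conceptual sketch of the layer-by-layer idea, keeping at most one layer's weights resident at a time (helper names are illustrative, not the streaming module's internals):

```python
def forward_layer_by_layer(hidden, layers, load_weights, free_weights):
    """Run a forward pass while streaming weights one layer at a time."""
    for layer in layers:
        weights = load_weights(layer)  # stream this layer's tensors from disk
        hidden = layer.forward(hidden, weights)
        free_weights(weights)  # release GPU memory before the next layer
    return hidden
```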

### cuBLAS Dynamic Loader
Runtime cuBLAS/cuBLASLt loading with no compile-time CUDA Toolkit dependency (the probing order is sketched after the table):

| Feature | Description |
|---------|-------------|
| **Dynamic DLL loading** | Searches CUDA_PATH, system PATH |
| **Version detection** | Auto-selects cublasLt64_13/12/11.dll |
| **Graceful fallback** | Uses native kernels if cuBLAS unavailable |
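
A minimal sketch of the probing order on Windows, using `ctypes` and the DLL names from the table (illustrative only, not the loader's actual implementation):

```python
import ctypes
import os

def try_load_cublaslt():
    """Return a cuBLASLt handle, or None to signal native-kernel fallback."""
    names = ["cublasLt64_13.dll", "cublasLt64_12.dll", "cublasLt64_11.dll"]
    dirs = []
    cuda_path = os.environ.get("CUDA_PATH")
    if cuda_path:
        dirs.append(os.path.join(cuda_path, "bin"))
    dirs.append("")  # bare name -> default system PATH search
    for directory in dirs:
        for name in names:  # newest ABI first
            try:
                return ctypes.WinDLL(os.path.join(directory, name))
            except OSError:
                continue
    return None
```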

### C++ Kernel Profiler
Built-in CUDA kernel profiling with minimal overhead:

```python
from pygpukit import enable_profiling, get_profile_stats

enable_profiling(True)
# ... run your code ...
stats = get_profile_stats()
for name, info in stats.items():
    print(f"{name}: {info['avg_ms']:.3f} ms ({info['count']} calls)")
```

### HuggingFace T5 Encoder Support
T5 text encoder with sharded safetensors loading for FLUX/SD3 (shard discovery is sketched after the table):

| Feature | Description |
|---------|-------------|
| **Sharded loading** | Supports `model-00001-of-00002.safetensors` format |
| **T5EncoderModel** | Full T5 encoder implementation |
| **Automatic detection** | Finds encoder in model directories |
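
For reference, the standard sharded layout ships a `model.safetensors.index.json` mapping tensor names to shard files. A sketch of the discovery logic under that assumption (function name and return type are illustrative, not the PyGPUkit API):

```python
import json
from pathlib import Path

from safetensors import safe_open

def load_sharded(model_dir: str) -> dict:
    root = Path(model_dir)
    index = root / "model.safetensors.index.json"
    if index.exists():
        weight_map = json.loads(index.read_text())["weight_map"]
        shards = sorted(set(weight_map.values()))  # e.g. model-00001-of-00002.safetensors
    else:
        shards = sorted(p.name for p in root.glob("model-*-of-*.safetensors"))
    tensors = {}
    for shard in shards:
        with safe_open(str(root / shard), framework="np") as f:
            for key in f.keys():
                tensors[key] = f.get_tensor(key)
    return tensors
```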

### DiT Architecture Support
Diffusion Transformer (DiT) components for PixArt and similar models (the AdaLN-Zero rule is sketched after the table):

| Module | Description |
|--------|-------------|
| `dit/model.py` | PixArt transformer with AdaLN-Zero |
| `dit/attention.py` | Self/cross attention with GQA |
| `dit/embeddings.py` | Patch embed, timestep embed, 2D sincos pos |
| `dit/adaln.py` | Adaptive LayerNorm modulation |
| `dit/ffn.py` | GEGLU feed-forward network |
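
As background for `dit/adaln.py`: AdaLN-Zero derives shift/scale/gate modulation from the conditioning embedding. A conceptual numpy sketch (shapes and names are illustrative, not the module's API):

```python
import numpy as np

def adaln_zero_modulate(x, c, w, b):
    """x: [B, N, D] tokens; c: [B, D] conditioning (e.g. timestep embedding)."""
    # One linear projection of the conditioning yields shift, scale, and gate.
    shift, scale, gate = np.split(c @ w + b, 3, axis=-1)  # each [B, D]
    # LayerNorm without learned affine, then conditioned modulation.
    mu = x.mean(-1, keepdims=True)
    var = x.var(-1, keepdims=True)
    x_norm = (x - mu) / np.sqrt(var + 1e-6)
    x_mod = x_norm * (1 + scale[:, None, :]) + shift[:, None, :]
    return x_mod, gate  # gate scales the block output in the residual add
```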

### New GPU Operations
GPU-native operations added for the diffusion pipelines (numpy reference semantics for the transposes follow the table):

| Operation | Description |
|-----------|-------------|
| `transpose_4d_0213` | GPU-native 4D transpose [B,S,H,D] -> [B,H,S,D] |
| `transpose_3d_012` | GPU-native 3D transpose [B,S,D] -> [B,D,S] |
| `gpu_batched_matmul` | Batched matrix multiplication |
| `gpu_softmax` | GPU-native softmax |
| `gpu_apply_rope` | Apply rotary position embedding |
| `cross_attention` | Cross-attention for text conditioning |
| `conv2d` | 2D convolution for VAE/UNet |
| `group_norm` | Group normalization |
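
Reference semantics for the new transposes, as numpy equivalents (the GPU ops produce the same layouts without a host round-trip):

```python
import numpy as np

x = np.zeros((2, 16, 8, 64), dtype=np.float32)  # [B, S, H, D]
assert np.transpose(x, (0, 2, 1, 3)).shape == (2, 8, 16, 64)  # transpose_4d_0213

y = np.zeros((2, 16, 64), dtype=np.float32)  # [B, S, D]
assert np.transpose(y, (0, 2, 1)).shape == (2, 64, 16)  # transpose_3d_012 per the table
```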

---

## What's New in v0.2.18

### Major Codebase Refactoring
@@ -595,6 +703,7 @@ PyGPUkit/
| **v0.2.16** | **MoE support** (Mixtral), Thinking models (Qwen3), W8A8/W4A4 GEMV, W8A16/Int8/Int4 GEMM, Kernel restructure |
| **v0.2.17** | **Triton backend** MVP, hybrid execution (Triton + Native CUDA), TritonArray wrapper |
| **v0.2.18** | **Codebase refactoring**, Kokoro TTS, Positional encoding (PoPE/ALiBi/YaRN/NTK), ReLU², Unified benchmark, BF16 GEMV (98% BW), W8A16 fix |
| **v0.2.19** | **FLUX.1 image generation**, Lazy model loading (streaming), cuBLAS dynamic loader, C++ kernel profiler, T5 encoder, DiT architecture, GPU-native diffusion ops |

### Planned

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build"

[project]
name = "PyGPUkit"
-version = "0.2.18"
+version = "0.2.19"
description = "A lightweight GPU runtime for Python with Rust-powered scheduler, NVRTC JIT compilation, and NumPy-like API"
readme = "README.md"
license = "MIT"
2 changes: 1 addition & 1 deletion src/pygpukit/benchmark/results.py
@@ -57,7 +57,7 @@ class BenchmarkReport:
    gpu: GPUInfo
    results: list[BenchmarkResult] = field(default_factory=list)
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
-    version: str = "0.2.18"
+    version: str = "0.2.19"

    def add(self, result: BenchmarkResult) -> None:
        self.results.append(result)
6 changes: 3 additions & 3 deletions src/pygpukit/diffusion/models/dit/embeddings.py
@@ -64,10 +64,10 @@ def get_2d_sincos_pos_embed(embed_dim: int, grid_size: int | tuple[int, int]) ->

    # Create 2D grid in column-major order (h varies first)
    # This matches diffusers: for each column, iterate through rows
-    h_grid, w_grid = np.meshgrid(grid_h_pos, grid_w_pos, indexing='ij')
+    h_grid, w_grid = np.meshgrid(grid_h_pos, grid_w_pos, indexing="ij")
    # Flatten in Fortran order (column-major) to match diffusers patch ordering
-    h_flat = h_grid.flatten('F')  # [H*W]
-    w_flat = w_grid.flatten('F')  # [H*W]
+    h_flat = h_grid.flatten("F")  # [H*W]
+    w_flat = w_grid.flatten("F")  # [H*W]

    # Get embeddings for each dimension
    emb_h = sinusoidal_embedding(h_flat, embed_dim // 2)  # height embedding
37 changes: 30 additions & 7 deletions src/pygpukit/diffusion/models/dit/model.py
@@ -98,7 +98,11 @@ def from_safetensors(

        # Detect spec from weights
        hidden_size = weights["pos_embed.proj.bias"].shape[0]
-        num_blocks = sum(1 for k in weights if k.startswith("transformer_blocks.") and k.endswith(".attn1.to_q.weight"))
+        num_blocks = sum(
+            1
+            for k in weights
+            if k.startswith("transformer_blocks.") and k.endswith(".attn1.to_q.weight")
+        )

        spec = PixArtSpec(
            name="pixart_sigma",
@@ -179,7 +183,9 @@ def _patch_embed(self, x: GPUArray) -> GPUArray:
        # Add 2D sinusoidal positional embedding
        pos_embed = get_2d_sincos_pos_embed(self.hidden_size, (h_patches, w_patches))
        x_proj_np = x_proj.to_numpy()
-        x_proj_np = x_proj_np + pos_embed[None, :, :]  # [1, num_patches, D] broadcast to [B, num_patches, D]
+        x_proj_np = (
+            x_proj_np + pos_embed[None, :, :]
+        )  # [1, num_patches, D] broadcast to [B, num_patches, D]

        return from_numpy(x_proj_np.astype(np.float32))

@@ -325,8 +331,15 @@ def _self_attention(self, x: GPUArray, layer_idx: int) -> GPUArray:
            return x

        return self_attention(
-            x, q_w, k_w, v_w, out_w,
-            q_b, k_b, v_b, out_b,
+            x,
+            q_w,
+            k_w,
+            v_w,
+            out_w,
+            q_b,
+            k_b,
+            v_b,
+            out_b,
            num_heads=self.num_heads,
        )

@@ -348,8 +361,16 @@ def _cross_attention(self, x: GPUArray, context: GPUArray, layer_idx: int) -> GPUArray:
            return from_numpy(np.zeros_like(x.to_numpy()))

        return cross_attention(
-            x, context, q_w, k_w, v_w, out_w,
-            q_b, k_b, v_b, out_b,
+            x,
+            context,
+            q_w,
+            k_w,
+            v_w,
+            out_w,
+            q_b,
+            k_b,
+            v_b,
+            out_b,
            num_heads=self.num_heads,
        )

@@ -398,7 +419,9 @@ def _final_layer(self, x: GPUArray, t_emb: GPUArray, H: int, W: int) -> GPUArray:

        if proj_w is not None:
            return unpatchify(
-                x, H, W,
+                x,
+                H,
+                W,
                out_channels=self.spec.out_channels,
                patch_size=self.patch_size,
                proj_weight=proj_w,
1 change: 1 addition & 0 deletions src/pygpukit/diffusion/models/flux/blocks.py
@@ -161,6 +161,7 @@ def joint_block(
    Returns:
        Tuple of (image_output, text_output).
    """
+
    # Get weights helper
    def get_weight(name: str) -> GPUArray | None:
        return weights.get(f"{prefix}.{name}")
4 changes: 3 additions & 1 deletion src/pygpukit/diffusion/models/flux/model.py
@@ -270,7 +270,9 @@ def forward(
        # [B, txt_seq_len, 4096] -> [B, txt_seq_len, hidden_size]
        txt_2d = encoder_hidden_states.reshape(B * txt_seq_len, self.config.joint_attention_dim)
        txt = gpu_linear(
-            txt_2d, self.weights["context_embedder.weight"], self.weights.get("context_embedder.bias")
+            txt_2d,
+            self.weights["context_embedder.weight"],
+            self.weights.get("context_embedder.bias"),
        )
        txt = txt.reshape(B, txt_seq_len, self.config.hidden_size)
