diff --git a/README.md b/README.md
index ad30e5b..410ec73 100644
--- a/README.md
+++ b/README.md
@@ -99,6 +99,114 @@ They were all observed in production or real benchmarks.
 
 ---
 
+## What's New in v0.2.19
+
+### FLUX.1 Image Generation
+Text-to-image generation with Black Forest Labs' FLUX.1 model:
+
+```python
+from pygpukit.diffusion import FluxPipeline
+
+# Load FLUX.1-schnell (fast variant)
+pipeline = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell")
+
+# Generate image
+image = pipeline.generate(
+    prompt="a photo of a cat sitting on a windowsill",
+    num_inference_steps=4,  # schnell needs only a few steps
+    guidance_scale=0.0,  # schnell doesn't use classifier-free guidance
+)
+image.save("output.png")
+```
+
+| Component | Description |
+|-----------|-------------|
+| **FluxTransformer** | 19 joint blocks + 38 single blocks |
+| **FluxScheduler** | Flow matching Euler scheduler |
+| **GPU-native ops** | Transpose, batched matmul, RoPE on GPU |
+| **RoPE frequencies** | Cached on GPU for efficient reuse |
+
+### Lazy Model Loading with Streaming
+Memory-efficient loading strategies for large models:
+
+```python
+from pygpukit.llm import QwenModel, StreamingStrategy
+
+# Progressive loading - load weights as they are needed
+model = QwenModel.from_safetensors(
+    "path/to/model",
+    streaming=StreamingStrategy.PROGRESSIVE
+)
+
+# Layer-by-layer streaming for memory-constrained environments
+model = QwenModel.from_safetensors(
+    "path/to/model",
+    streaming=StreamingStrategy.LAYER_BY_LAYER
+)
+```
+
+| Strategy | Description |
+|----------|-------------|
+| `EAGER` | Load all weights at once (default) |
+| `PROGRESSIVE` | Load weights progressively during the first forward pass |
+| `LAYER_BY_LAYER` | Stream one layer at a time for minimal memory use |
+
+### cuBLAS Dynamic Loader
+Runtime loading of cuBLAS/cuBLASLt without a compile-time CUDA Toolkit dependency:
+
+| Feature | Description |
+|---------|-------------|
+| **Dynamic DLL loading** | Searches `CUDA_PATH`, then the system `PATH` |
+| **Version detection** | Auto-selects `cublasLt64_13/12/11.dll` |
+| **Graceful fallback** | Falls back to native kernels if cuBLAS is unavailable |
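+
+The lookup order can be sketched in Python. This is only an illustration of the
+search strategy described above, not the actual loader (which is implemented in
+C++), and the function name is hypothetical:
+
+```python
+import ctypes
+import os
+
+def try_load_cublaslt() -> ctypes.CDLL | None:  # hypothetical name
+    """Sketch: try the newest DLL first, CUDA_PATH/bin before the system PATH."""
+    names = ["cublasLt64_13.dll", "cublasLt64_12.dll", "cublasLt64_11.dll"]
+    dirs = []
+    cuda_path = os.environ.get("CUDA_PATH")
+    if cuda_path:
+        dirs.append(os.path.join(cuda_path, "bin"))
+    dirs.append(None)  # bare name -> resolved through the system PATH
+    for d in dirs:
+        for name in names:
+            try:
+                return ctypes.CDLL(os.path.join(d, name) if d else name)
+            except OSError:
+                continue
+    return None  # caller falls back to the native kernels
+```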
+
+### C++ Kernel Profiler
+Built-in CUDA kernel profiling with minimal overhead:
+
+```python
+from pygpukit import enable_profiling, get_profile_stats
+
+enable_profiling(True)
+# ... run your code ...
+stats = get_profile_stats()
+for name, info in stats.items():
+    print(f"{name}: {info['avg_ms']:.3f} ms ({info['count']} calls)")
+```
+
+### HuggingFace T5 Encoder Support
+T5 text encoder with sharded safetensors loading, for FLUX/SD3 pipelines:
+
+| Feature | Description |
+|---------|-------------|
+| **Sharded loading** | Supports the `model-00001-of-00002.safetensors` shard format |
+| **T5EncoderModel** | Full T5 encoder implementation |
+| **Automatic detection** | Finds the encoder inside model directories |
+
+### DiT Architecture Support
+Diffusion Transformer (DiT) components for PixArt and similar models:
+
+| Module | Description |
+|--------|-------------|
+| `dit/model.py` | PixArt transformer with AdaLN-Zero |
+| `dit/attention.py` | Self- and cross-attention with GQA |
+| `dit/embeddings.py` | Patch embedding, timestep embedding, 2D sin-cos positional embedding |
+| `dit/adaln.py` | Adaptive LayerNorm modulation |
+| `dit/ffn.py` | GEGLU feed-forward network |
+
+### New GPU Operations
+| Operation | Description |
+|-----------|-------------|
+| `transpose_4d_0213` | GPU-native 4D transpose [B,S,H,D] -> [B,H,S,D] |
+| `transpose_3d_012` | GPU-native 3D transpose [B,S,D] -> [B,D,S] |
+| `gpu_batched_matmul` | Batched matrix multiplication |
+| `gpu_softmax` | GPU-native softmax |
+| `gpu_apply_rope` | Apply rotary position embedding |
+| `cross_attention` | Cross-attention for text conditioning |
+| `conv2d` | 2D convolution for VAE/UNet |
+| `group_norm` | Group normalization |
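+
+As a CPU reference for the transpose naming, a numpy sketch of what
+`transpose_4d_0213` computes (shapes are illustrative):
+
+```python
+import numpy as np
+
+# transpose_4d_0213 applies the axis permutation (0, 2, 1, 3):
+# [B, S, H, D] -> [B, H, S, D], i.e. the usual regrouping from
+# token-major activations to head-major attention inputs.
+x = np.random.rand(2, 16, 8, 64).astype(np.float32)  # [B, S, H, D]
+ref = x.transpose(0, 2, 1, 3)  # [B, H, S, D]
+assert ref.shape == (2, 8, 16, 64)
+```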
+
+---
+
 ## What's New in v0.2.18
 
 ### Major Codebase Refactoring
@@ -595,6 +703,7 @@ PyGPUkit/
 | **v0.2.16** | **MoE support** (Mixtral), Thinking models (Qwen3), W8A8/W4A4 GEMV, W8A16/Int8/Int4 GEMM, Kernel restructure |
 | **v0.2.17** | **Triton backend** MVP, hybrid execution (Triton + Native CUDA), TritonArray wrapper |
 | **v0.2.18** | **Codebase refactoring**, Kokoro TTS, Positional encoding (PoPE/ALiBi/YaRN/NTK), ReLU², Unified benchmark, BF16 GEMV (98% BW), W8A16 fix |
+| **v0.2.19** | **FLUX.1 image generation**, Lazy model loading (streaming), cuBLAS dynamic loader, C++ kernel profiler, T5 encoder, DiT architecture, GPU-native diffusion ops |
 
 ### Planned
 
diff --git a/pyproject.toml b/pyproject.toml
index c9d823b..59422a9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build"
 
 [project]
 name = "PyGPUkit"
-version = "0.2.18"
+version = "0.2.19"
 description = "A lightweight GPU runtime for Python with Rust-powered scheduler, NVRTC JIT compilation, and NumPy-like API"
 readme = "README.md"
 license = "MIT"
diff --git a/src/pygpukit/benchmark/results.py b/src/pygpukit/benchmark/results.py
index 268f7dc..45c5ac3 100644
--- a/src/pygpukit/benchmark/results.py
+++ b/src/pygpukit/benchmark/results.py
@@ -57,7 +57,7 @@ class BenchmarkReport:
     gpu: GPUInfo
     results: list[BenchmarkResult] = field(default_factory=list)
     timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
-    version: str = "0.2.18"
+    version: str = "0.2.19"
 
     def add(self, result: BenchmarkResult) -> None:
         self.results.append(result)
diff --git a/src/pygpukit/diffusion/models/dit/embeddings.py b/src/pygpukit/diffusion/models/dit/embeddings.py
index a4d935e..987ef6f 100644
--- a/src/pygpukit/diffusion/models/dit/embeddings.py
+++ b/src/pygpukit/diffusion/models/dit/embeddings.py
@@ -64,10 +64,10 @@ def get_2d_sincos_pos_embed(embed_dim: int, grid_size: int | tuple[int, int]) ->
 
     # Create 2D grid in column-major order (h varies first)
     # This matches diffusers: for each column, iterate through rows
-    h_grid, w_grid = np.meshgrid(grid_h_pos, grid_w_pos, indexing='ij')
+    h_grid, w_grid = np.meshgrid(grid_h_pos, grid_w_pos, indexing="ij")
 
     # Flatten in Fortran order (column-major) to match diffusers patch ordering
-    h_flat = h_grid.flatten('F')  # [H*W]
-    w_flat = w_grid.flatten('F')  # [H*W]
+    h_flat = h_grid.flatten("F")  # [H*W]
+    w_flat = w_grid.flatten("F")  # [H*W]
 
     # Get embeddings for each dimension
     emb_h = sinusoidal_embedding(h_flat, embed_dim // 2)  # height embedding
diff --git a/src/pygpukit/diffusion/models/dit/model.py b/src/pygpukit/diffusion/models/dit/model.py
index 797be47..4c4611a 100644
--- a/src/pygpukit/diffusion/models/dit/model.py
+++ b/src/pygpukit/diffusion/models/dit/model.py
@@ -98,7 +98,11 @@ def from_safetensors(
 
     # Detect spec from weights
     hidden_size = weights["pos_embed.proj.bias"].shape[0]
-    num_blocks = sum(1 for k in weights if k.startswith("transformer_blocks.") and k.endswith(".attn1.to_q.weight"))
+    num_blocks = sum(
+        1
+        for k in weights
+        if k.startswith("transformer_blocks.") and k.endswith(".attn1.to_q.weight")
+    )
 
     spec = PixArtSpec(
         name="pixart_sigma",
@@ -179,7 +183,9 @@ def _patch_embed(self, x: GPUArray) -> GPUArray:
     # Add 2D sinusoidal positional embedding
     pos_embed = get_2d_sincos_pos_embed(self.hidden_size, (h_patches, w_patches))
     x_proj_np = x_proj.to_numpy()
-    x_proj_np = x_proj_np + pos_embed[None, :, :]  # [1, num_patches, D] broadcast to [B, num_patches, D]
+    x_proj_np = (
+        x_proj_np + pos_embed[None, :, :]
+    )  # [1, num_patches, D] broadcast to [B, num_patches, D]
 
     return from_numpy(x_proj_np.astype(np.float32))
 
@@ -325,8 +331,15 @@ def _self_attention(self, x: GPUArray, layer_idx: int) -> GPUArray:
         return x
 
     return self_attention(
-        x, q_w, k_w, v_w, out_w,
-        q_b, k_b, v_b, out_b,
+        x,
+        q_w,
+        k_w,
+        v_w,
+        out_w,
+        q_b,
+        k_b,
+        v_b,
+        out_b,
         num_heads=self.num_heads,
     )
 
@@ -348,8 +361,16 @@ def _cross_attention(self, x: GPUArray, context: GPUArray, layer_idx: int) -> GP
         return from_numpy(np.zeros_like(x.to_numpy()))
 
     return cross_attention(
-        x, context, q_w, k_w, v_w, out_w,
-        q_b, k_b, v_b, out_b,
+        x,
+        context,
+        q_w,
+        k_w,
+        v_w,
+        out_w,
+        q_b,
+        k_b,
+        v_b,
+        out_b,
         num_heads=self.num_heads,
     )
 
@@ -398,7 +419,9 @@ def _final_layer(self, x: GPUArray, t_emb: GPUArray, H: int, W: int) -> GPUArray
 
     if proj_w is not None:
         return unpatchify(
-            x, H, W,
+            x,
+            H,
+            W,
             out_channels=self.spec.out_channels,
             patch_size=self.patch_size,
             proj_weight=proj_w,
diff --git a/src/pygpukit/diffusion/models/flux/blocks.py b/src/pygpukit/diffusion/models/flux/blocks.py
index 42268c5..001b3cf 100644
--- a/src/pygpukit/diffusion/models/flux/blocks.py
+++ b/src/pygpukit/diffusion/models/flux/blocks.py
@@ -161,6 +161,7 @@ def joint_block(
     Returns:
         Tuple of (image_output, text_output).
    """
+    # Helper: look up this block's weights by prefixed name
    def get_weight(name: str) -> GPUArray | None:
        return weights.get(f"{prefix}.{name}")
 
diff --git a/src/pygpukit/diffusion/models/flux/model.py b/src/pygpukit/diffusion/models/flux/model.py
index 1fd567b..98688e7 100644
--- a/src/pygpukit/diffusion/models/flux/model.py
+++ b/src/pygpukit/diffusion/models/flux/model.py
@@ -270,7 +270,9 @@ def forward(
     # [B, txt_seq_len, 4096] -> [B, txt_seq_len, hidden_size]
     txt_2d = encoder_hidden_states.reshape(B * txt_seq_len, self.config.joint_attention_dim)
     txt = gpu_linear(
-        txt_2d, self.weights["context_embedder.weight"], self.weights.get("context_embedder.bias")
+        txt_2d,
+        self.weights["context_embedder.weight"],
+        self.weights.get("context_embedder.bias"),
     )
     txt = txt.reshape(B, txt_seq_len, self.config.hidden_size)