Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions configs/qwen_image/qwen_image_t2i_2512_distill_zoe.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
    "infer_steps": 4,
    "max_custom_size": 4096,
    "prompt_template_encode": "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
    "prompt_template_encode_start_idx": 34,
    "attn_type": "sage_attn2",
    "enable_cfg": false,
    "dit_original_ckpt": "/data/nvme1/yongyang/ccc/models/distill_zoe_diff_qwen_image_data_680w_neo_prompt_res2k_3kiter_multi_large_char_200iter_step4.safetensors",
    "sample_shift": 5.0,
    "zoe_style_noise": true
}
13 changes: 13 additions & 0 deletions configs/qwen_image/qwen_image_t2i_2512_distill_zoe_fp8.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
    "infer_steps": 4,
    "max_custom_size": 4096,
    "prompt_template_encode": "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
    "prompt_template_encode_start_idx": 34,
    "attn_type": "sage_attn2",
    "enable_cfg": false,
    "dit_quantized": true,
    "dit_quantized_ckpt": "/data/nvme1/yongyang/ccc/models/distill_zoe_diff_qwen_image_data_680w_neo_prompt_res2k_3kiter_multi_large_char_200iter_step4_fp8_mix.safetensors",
    "dit_quant_scheme": "fp8-sgl",
    "sample_shift": 5.0,
    "zoe_style_noise": true
}
96 changes: 71 additions & 25 deletions lightx2v/models/schedulers/qwen_image/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ def calculate_shift(
return mu


def time_shift_linear(mu: float, t: torch.Tensor) -> torch.Tensor:
    """Apply zoe-diffusion's linear time shift to a tensor of timesteps.

    Computes ``mu / (mu + (1/t - 1))`` element-wise; ``t`` must be nonzero.
    """
    offset = t.reciprocal() - 1.0
    return mu / (mu + offset)


# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
def retrieve_timesteps(
scheduler,
Expand Down Expand Up @@ -428,7 +433,7 @@ def __init__(self, config):
with open(os.path.join(config["model_path"], "scheduler", "scheduler_config.json"), "r") as f:
self.scheduler_config = json.load(f)
self.dtype = torch.bfloat16
self.sample_guide_scale = self.config["sample_guide_scale"]
self.sample_guide_scale = self.config.get("sample_guide_scale", None)
self.zero_cond_t = config.get("zero_cond_t", False)
if self.config["seq_parallel"]:
self.seq_p_group = self.config.get("device_mesh").get_group(mesh_dim="seq_p")
Expand Down Expand Up @@ -480,43 +485,84 @@ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):

return latent_image_ids.to(device=device, dtype=dtype)

def _prepare_latents_lightx2v(self, shape, height, width, num_channels_latents):
    """Original LightX2V latent generation: sample noise in [B, T, C, H, W] and pack it."""
    noise = randn_tensor(shape, generator=self.generator, device=AI_DEVICE, dtype=self.dtype)
    if self.is_layered:
        # Layered mode packs an extra layer dimension (layers + background).
        return self._pack_latents(noise, 1, num_channels_latents, height, width, self.layers + 1)
    return self._pack_latents(noise, 1, num_channels_latents, height, width)

def _prepare_latents_zoe(self, shape, height, width, num_channels_latents):
    """Zoe-aligned latent generation: noise sampled in packed format [B, C*4, T, H//2, W//2].

    Sampling in this layout reproduces Zoe's random draw order so the noise is
    bit-exact with zoe-diffusion for the same generator seed.
    """
    b, t = shape[0], shape[1]
    zoe_shape = (b, num_channels_latents * 4, t, height // 2, width // 2)
    latents = randn_tensor(zoe_shape, generator=self.generator, device=AI_DEVICE, dtype=self.dtype)
    # Convert to LightX2V sequence format: [B, (H//2)*(W//2), C*4].
    # Index frame 0 explicitly rather than squeeze(2): squeeze is a silent
    # no-op when t > 1, which would leave a 5D tensor and break the permute
    # below. This path is image-only, so t is expected to be 1.
    latents = latents[:, :, 0, :, :]  # [B, C*4, H//2, W//2]
    latents = latents.permute(0, 2, 3, 1)  # [B, H//2, W//2, C*4]
    latents = latents.reshape(b, (height // 2) * (width // 2), num_channels_latents * 4)
    return latents

def prepare_latents(self, input_info):
    """Sample the initial latents and positional ids for one generation request.

    Chooses between the Zoe-aligned noise path (bit-exact with zoe-diffusion,
    image-only) and the original LightX2V path, then stores the results on
    ``self`` for the denoising loop.
    """
    self.input_info = input_info
    shape = input_info.target_shape
    # shape: [B, T, C, H, W]
    width, height = shape[-1], shape[-2]
    num_channels_latents = self.config.get("num_channels_latents", 16)

    # Zoe-style noise is only defined for the non-layered (plain image) case.
    if self.config.get("zoe_style_noise", False) and not self.is_layered:
        latents = self._prepare_latents_zoe(shape, height, width, num_channels_latents)
    else:
        latents = self._prepare_latents_lightx2v(shape, height, width, num_channels_latents)

    latent_image_ids = self._prepare_latent_image_ids(1, height // 2, width // 2, AI_DEVICE, self.dtype)
    self.latents = latents
    self.latent_image_ids = latent_image_ids
    self.noise_pred = None

def set_timesteps(self):
    """Build the sigma/timestep schedule for inference.

    Two modes:
    - If ``sample_shift`` is configured, apply Zoe's resolution-independent
      linear time shift with a fixed mu and program the scheduler directly.
    - Otherwise, fall back to the original resolution-adaptive exponential
      shift resolved through diffusers' ``retrieve_timesteps``.
    """
    num_inference_steps = self.config["infer_steps"]
    sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)

    sample_shift = self.config.get("sample_shift", None)
    if sample_shift is not None:
        # Zoe-style: linear time shift with a fixed mu, resolution-independent.
        # Formula: t_shifted = mu / (mu + (1/t - 1))
        sigmas_tensor = torch.from_numpy(sigmas).float().to(AI_DEVICE)
        sigmas_shifted = time_shift_linear(mu=sample_shift, t=sigmas_tensor)
        # Terminal sigma of 0 so the final step lands exactly on clean data.
        sigmas_shifted = torch.cat([sigmas_shifted, torch.zeros(1, device=AI_DEVICE)])
        self.scheduler.sigmas = sigmas_shifted.to(dtype=torch.float32, device=AI_DEVICE)
        self.scheduler.timesteps = sigmas_shifted[:-1] * self.scheduler_config["num_train_timesteps"]
        self.scheduler.timesteps = self.scheduler.timesteps.to(AI_DEVICE)
        # Reset internal counters so the scheduler re-indexes from the start.
        self.scheduler._step_index = None
        self.scheduler._begin_index = None
        timesteps = self.scheduler.timesteps
    else:
        # Original: resolution-adaptive exponential shift via diffusers.
        image_seq_len = self.latents.shape[1]
        if self.is_layered:
            base_seqlen = 256 * 256 / 16 / 16
            # Layered latents stack 5 planes; mu scales with per-plane length.
            image_seq_len = self.latents.shape[1] // 5
            mu = (image_seq_len / base_seqlen) ** 0.5
        else:
            mu = calculate_shift(
                image_seq_len,
                self.scheduler_config.get("base_image_seq_len", 256),
                self.scheduler_config.get("max_image_seq_len", 4096),
                self.scheduler_config.get("base_shift", 0.5),
                self.scheduler_config.get("max_shift", 1.15),
            )
        timesteps, num_inference_steps = retrieve_timesteps(
            self.scheduler,
            num_inference_steps,
            AI_DEVICE,
            sigmas=sigmas,
            mu=mu,
        )

    self.timesteps = timesteps
    self.infer_steps = num_inference_steps
Expand Down
21 changes: 21 additions & 0 deletions scripts/qwen_image/qwen_image_t2i_2512_distill_zoe.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash

# Launch a 4-step distilled Qwen-Image t2i run with the Zoe config.
# Paths default to the original development machine but can be overridden
# via environment variables for portability:
#   LIGHTX2V_PATH=/path/to/LightX2V MODEL_PATH=/path/to/model ./qwen_image_t2i_2512_distill_zoe.sh
lightx2v_path=${LIGHTX2V_PATH:-/data/nvme1/yongyang/ccc/LightX2V}
model_path=${MODEL_PATH:-/data/nvme1/models/Qwen/Qwen-Image-2512}

export CUDA_VISIBLE_DEVICES=0

# set environment variables
source ${lightx2v_path}/scripts/base/base.sh

python -m lightx2v.infer \
    --model_cls qwen_image \
    --task t2i \
    --model_path $model_path \
    --config_json ${lightx2v_path}/configs/qwen_image/qwen_image_t2i_2512_distill_zoe.json \
    --prompt '2K超高清画质,16:9宽屏比例,电影级渲染。一个精致的咖啡店门口场景,温馨的街道氛围。门口摆放着一个复古风格的木质黑板,黑板上用粉笔字体写着"日日新咖啡,2美元一杯",笔触温馨可爱。旁边有一个闪烁的霓虹灯招牌,红色霓虹灯管拼出"商汤科技"字样,现代科技感。旁边立着一幅精美的海报,海报上是一位优雅的中国美女模特,海报下方用时尚字体写着"SenseNova newbee"。整体氛围是东西方文化交融的现代咖啡馆,暖色调灯光,傍晚时分,细节精致,高质量渲染' \
    --negative_prompt " " \
    --save_result_path ${lightx2v_path}/save_results/qwen_image_t2i_2512_distill_zoe.png \
    --seed 42 \
    --target_shape 1536 2752
21 changes: 21 additions & 0 deletions scripts/qwen_image/qwen_image_t2i_2512_distill_zoe_fp8.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash

# Launch a 4-step distilled Qwen-Image t2i run with the Zoe FP8-quantized config.
# Paths default to the original development machine but can be overridden
# via environment variables for portability:
#   LIGHTX2V_PATH=/path/to/LightX2V MODEL_PATH=/path/to/model ./qwen_image_t2i_2512_distill_zoe_fp8.sh
lightx2v_path=${LIGHTX2V_PATH:-/data/nvme1/yongyang/ccc/LightX2V}
model_path=${MODEL_PATH:-/data/nvme1/models/Qwen/Qwen-Image-2512}

export CUDA_VISIBLE_DEVICES=0

# set environment variables
source ${lightx2v_path}/scripts/base/base.sh

python -m lightx2v.infer \
    --model_cls qwen_image \
    --task t2i \
    --model_path $model_path \
    --config_json ${lightx2v_path}/configs/qwen_image/qwen_image_t2i_2512_distill_zoe_fp8.json \
    --prompt '2K超高清画质,16:9宽屏比例,电影级渲染。一个精致的咖啡店门口场景,温馨的街道氛围。门口摆放着一个复古风格的木质黑板,黑板上用粉笔字体写着"日日新咖啡,2美元一杯",笔触温馨可爱。旁边有一个闪烁的霓虹灯招牌,红色霓虹灯管拼出"商汤科技"字样,现代科技感。旁边立着一幅精美的海报,海报上是一位优雅的中国美女模特,海报下方用时尚字体写着"SenseNova newbee"。整体氛围是东西方文化交融的现代咖啡馆,暖色调灯光,傍晚时分,细节精致,高质量渲染' \
    --negative_prompt " " \
    --save_result_path ${lightx2v_path}/save_results/qwen_image_t2i_2512_distill_zoe_fp8.png \
    --seed 42 \
    --target_shape 1536 2752
Loading