-
Notifications
You must be signed in to change notification settings - Fork 186
update zoe qwen-image #1012
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
update zoe qwen-image #1012
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| { | ||
| "infer_steps": 4, | ||
| "max_custom_size": 4096, | ||
| "prompt_template_encode": "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n", | ||
| "prompt_template_encode_start_idx": 34, | ||
| "attn_type": "sage_attn2", | ||
| "enable_cfg": false, | ||
| "dit_original_ckpt": "/data/nvme1/yongyang/ccc/models/distill_zoe_diff_qwen_image_data_680w_neo_prompt_res2k_3kiter_multi_large_char_200iter_step4.safetensors", | ||
| "sample_shift": 5.0, | ||
| "zoe_style_noise": true | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,13 @@ | ||
| { | ||
| "infer_steps": 4, | ||
| "max_custom_size": 4096, | ||
| "prompt_template_encode": "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n", | ||
| "prompt_template_encode_start_idx": 34, | ||
| "attn_type": "sage_attn2", | ||
| "enable_cfg": false, | ||
| "dit_quantized": true, | ||
| "dit_quantized_ckpt": "/data/nvme1/yongyang/ccc/models/distill_zoe_diff_qwen_image_data_680w_neo_prompt_res2k_3kiter_multi_large_char_200iter_step4_fp8_mix.safetensors", | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| "dit_quant_scheme": "fp8-sgl", | ||
| "sample_shift": 5.0, | ||
| "zoe_style_noise": true | ||
| } | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -37,6 +37,11 @@ def calculate_shift( | |||||
| return mu | ||||||
|
|
||||||
|
|
||||||
| def time_shift_linear(mu: float, t: torch.Tensor) -> torch.Tensor: | ||||||
| """Linear time shift: mu / (mu + (1/t - 1)), matching zoe-diffusion's implementation.""" | ||||||
| return mu / (mu + (1.0 / t - 1.0)) | ||||||
|
|
||||||
|
|
||||||
| # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps | ||||||
| def retrieve_timesteps( | ||||||
| scheduler, | ||||||
|
|
@@ -428,7 +433,7 @@ def __init__(self, config): | |||||
| with open(os.path.join(config["model_path"], "scheduler", "scheduler_config.json"), "r") as f: | ||||||
| self.scheduler_config = json.load(f) | ||||||
| self.dtype = torch.bfloat16 | ||||||
| self.sample_guide_scale = self.config["sample_guide_scale"] | ||||||
| self.sample_guide_scale = self.config.get("sample_guide_scale", None) | ||||||
| self.zero_cond_t = config.get("zero_cond_t", False) | ||||||
| if self.config["seq_parallel"]: | ||||||
| self.seq_p_group = self.config.get("device_mesh").get_group(mesh_dim="seq_p") | ||||||
|
|
@@ -480,43 +485,84 @@ def _prepare_latent_image_ids(batch_size, height, width, device, dtype): | |||||
|
|
||||||
| return latent_image_ids.to(device=device, dtype=dtype) | ||||||
|
|
||||||
| def _prepare_latents_lightx2v(self, shape, height, width, num_channels_latents): | ||||||
| """Original LightX2V latent generation: noise in [B, T, C, H, W] then pack.""" | ||||||
| latents = randn_tensor(shape, generator=self.generator, device=AI_DEVICE, dtype=self.dtype) | ||||||
| if self.is_layered: | ||||||
| latents = self._pack_latents(latents, 1, num_channels_latents, height, width, self.layers + 1) | ||||||
| else: | ||||||
| latents = self._pack_latents(latents, 1, num_channels_latents, height, width) | ||||||
| return latents | ||||||
|
|
||||||
| def _prepare_latents_zoe(self, shape, height, width, num_channels_latents): | ||||||
| """Zoe-aligned latent generation: noise in packed format [B, C*4, T, H//2, W//2]. | ||||||
| Ensures the same random sampling order as Zoe for bit-exact alignment. | ||||||
| """ | ||||||
| b, t = shape[0], shape[1] | ||||||
| zoe_shape = (b, num_channels_latents * 4, t, height // 2, width // 2) | ||||||
| latents = randn_tensor(zoe_shape, generator=self.generator, device=AI_DEVICE, dtype=self.dtype) | ||||||
| # Convert to LightX2V sequence format: [B, (H//2)*(W//2), C*4] | ||||||
| latents = latents.squeeze(2) # [B, C*4, H//2, W//2] | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Using
Suggested change
|
||||||
| latents = latents.permute(0, 2, 3, 1) # [B, H//2, W//2, C*4] | ||||||
| latents = latents.reshape(b, (height // 2) * (width // 2), num_channels_latents * 4) | ||||||
| return latents | ||||||
|
|
||||||
| def prepare_latents(self, input_info): | ||||||
| self.input_info = input_info | ||||||
| shape = input_info.target_shape | ||||||
| # shape: [B, T, C, H, W] | ||||||
| width, height = shape[-1], shape[-2] | ||||||
| latents = randn_tensor(shape, generator=self.generator, device=AI_DEVICE, dtype=self.dtype) | ||||||
| if self.is_layered: | ||||||
| latents = self._pack_latents(latents, 1, self.config.get("num_channels_latents", 16), height, width, self.layers + 1) | ||||||
| num_channels_latents = self.config.get("num_channels_latents", 16) | ||||||
|
|
||||||
| if self.config.get("zoe_style_noise", False) and not self.is_layered: | ||||||
| latents = self._prepare_latents_zoe(shape, height, width, num_channels_latents) | ||||||
| else: | ||||||
| latents = self._pack_latents(latents, 1, self.config.get("num_channels_latents", 16), height, width) | ||||||
| latents = self._prepare_latents_lightx2v(shape, height, width, num_channels_latents) | ||||||
|
|
||||||
| latent_image_ids = self._prepare_latent_image_ids(1, height // 2, width // 2, AI_DEVICE, self.dtype) | ||||||
| self.latents = latents | ||||||
| self.latent_image_ids = latent_image_ids | ||||||
| self.noise_pred = None | ||||||
|
|
||||||
| def set_timesteps(self): | ||||||
| sigmas = np.linspace(1.0, 1 / self.config["infer_steps"], self.config["infer_steps"]) | ||||||
| image_seq_len = self.latents.shape[1] | ||||||
| if self.is_layered: | ||||||
| base_seqlen = 256 * 256 / 16 / 16 | ||||||
| image_seq_len = self.latents.shape[1] // 5 | ||||||
| mu = (image_seq_len / base_seqlen) ** 0.5 | ||||||
| num_inference_steps = self.config["infer_steps"] | ||||||
| sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) | ||||||
|
|
||||||
| sample_shift = self.config.get("sample_shift", None) | ||||||
| if sample_shift is not None: | ||||||
| # Zoe-style: linear time shift with a fixed mu, resolution-independent. | ||||||
| # Formula: t_shifted = mu / (mu + (1/t - 1)) | ||||||
| sigmas_tensor = torch.from_numpy(sigmas).float().to(AI_DEVICE) | ||||||
| sigmas_shifted = time_shift_linear(mu=sample_shift, t=sigmas_tensor) | ||||||
| sigmas_shifted = torch.cat([sigmas_shifted, torch.zeros(1, device=AI_DEVICE)]) | ||||||
| self.scheduler.sigmas = sigmas_shifted.to(dtype=torch.float32, device=AI_DEVICE) | ||||||
| self.scheduler.timesteps = sigmas_shifted[:-1] * self.scheduler_config["num_train_timesteps"] | ||||||
| self.scheduler.timesteps = self.scheduler.timesteps.to(AI_DEVICE) | ||||||
| self.scheduler._step_index = None | ||||||
| self.scheduler._begin_index = None | ||||||
| timesteps = self.scheduler.timesteps | ||||||
| else: | ||||||
| mu = calculate_shift( | ||||||
| image_seq_len, | ||||||
| self.scheduler_config.get("base_image_seq_len", 256), | ||||||
| self.scheduler_config.get("max_image_seq_len", 4096), | ||||||
| self.scheduler_config.get("base_shift", 0.5), | ||||||
| self.scheduler_config.get("max_shift", 1.15), | ||||||
| # Original: resolution-adaptive exponential shift via diffusers. | ||||||
| image_seq_len = self.latents.shape[1] | ||||||
| if self.is_layered: | ||||||
| base_seqlen = 256 * 256 / 16 / 16 | ||||||
| image_seq_len = self.latents.shape[1] // 5 | ||||||
| mu = (image_seq_len / base_seqlen) ** 0.5 | ||||||
| else: | ||||||
| mu = calculate_shift( | ||||||
| image_seq_len, | ||||||
| self.scheduler_config.get("base_image_seq_len", 256), | ||||||
| self.scheduler_config.get("max_image_seq_len", 4096), | ||||||
| self.scheduler_config.get("base_shift", 0.5), | ||||||
| self.scheduler_config.get("max_shift", 1.15), | ||||||
| ) | ||||||
| timesteps, num_inference_steps = retrieve_timesteps( | ||||||
| self.scheduler, | ||||||
| num_inference_steps, | ||||||
| AI_DEVICE, | ||||||
| sigmas=sigmas, | ||||||
| mu=mu, | ||||||
| ) | ||||||
| num_inference_steps = self.config["infer_steps"] | ||||||
| timesteps, num_inference_steps = retrieve_timesteps( | ||||||
| self.scheduler, | ||||||
| num_inference_steps, | ||||||
| AI_DEVICE, | ||||||
| sigmas=sigmas, | ||||||
| mu=mu, | ||||||
| ) | ||||||
|
|
||||||
| self.timesteps = timesteps | ||||||
| self.infer_steps = num_inference_steps | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,21 @@ | ||
| #!/bin/bash | ||
|
|
||
| # set path firstly | ||
| lightx2v_path=/data/nvme1/yongyang/ccc/LightX2V | ||
| model_path=/data/nvme1/models/Qwen/Qwen-Image-2512 | ||
|
Comment on lines
+4
to
+5
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
|
||
| export CUDA_VISIBLE_DEVICES=0 | ||
|
|
||
| # set environment variables | ||
| source ${lightx2v_path}/scripts/base/base.sh | ||
|
|
||
| python -m lightx2v.infer \ | ||
| --model_cls qwen_image \ | ||
| --task t2i \ | ||
| --model_path $model_path \ | ||
| --config_json ${lightx2v_path}/configs/qwen_image/qwen_image_t2i_2512_distill_zoe.json \ | ||
| --prompt '2K超高清画质,16:9宽屏比例,电影级渲染。一个精致的咖啡店门口场景,温馨的街道氛围。门口摆放着一个复古风格的木质黑板,黑板上用粉笔字体写着"日日新咖啡,2美元一杯",笔触温馨可爱。旁边有一个闪烁的霓虹灯招牌,红色霓虹灯管拼出"商汤科技"字样,现代科技感。旁边立着一幅精美的海报,海报上是一位优雅的中国美女模特,海报下方用时尚字体写着"SenseNova newbee"。整体氛围是东西方文化交融的现代咖啡馆,暖色调灯光,傍晚时分,细节精致,高质量渲染' \ | ||
| --negative_prompt " " \ | ||
| --save_result_path ${lightx2v_path}/save_results/qwen_image_t2i_2512_distill_zoe.png \ | ||
| --seed 42 \ | ||
| --target_shape 1536 2752 | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,21 @@ | ||
| #!/bin/bash | ||
|
|
||
| # set path firstly | ||
| lightx2v_path=/data/nvme1/yongyang/ccc/LightX2V | ||
| model_path=/data/nvme1/models/Qwen/Qwen-Image-2512 | ||
|
Comment on lines
+4
to
+5
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
|
||
| export CUDA_VISIBLE_DEVICES=0 | ||
|
|
||
| # set environment variables | ||
| source ${lightx2v_path}/scripts/base/base.sh | ||
|
|
||
| python -m lightx2v.infer \ | ||
| --model_cls qwen_image \ | ||
| --task t2i \ | ||
| --model_path $model_path \ | ||
| --config_json ${lightx2v_path}/configs/qwen_image/qwen_image_t2i_2512_distill_zoe_fp8.json \ | ||
| --prompt '2K超高清画质,16:9宽屏比例,电影级渲染。一个精致的咖啡店门口场景,温馨的街道氛围。门口摆放着一个复古风格的木质黑板,黑板上用粉笔字体写着"日日新咖啡,2美元一杯",笔触温馨可爱。旁边有一个闪烁的霓虹灯招牌,红色霓虹灯管拼出"商汤科技"字样,现代科技感。旁边立着一幅精美的海报,海报上是一位优雅的中国美女模特,海报下方用时尚字体写着"SenseNova newbee"。整体氛围是东西方文化交融的现代咖啡馆,暖色调灯光,傍晚时分,细节精致,高质量渲染' \ | ||
| --negative_prompt " " \ | ||
| --save_result_path ${lightx2v_path}/save_results/qwen_image_t2i_2512_distill_zoe_fp8.png \ | ||
| --seed 42 \ | ||
| --target_shape 1536 2752 | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The path to the checkpoint is hardcoded to a specific absolute path (
/data/nvme1/yongyang/...). This makes the configuration file non-portable and will cause failures in other environments. Consider using a relative path or a mechanism to resolve the model path dynamically.