Commit b31ed8e

wan
1 parent a2bdc06 commit b31ed8e

5 files changed: +266 -147 lines changed

docs/source/en/api/loaders/lora.md (+5 -1)

@@ -79,4 +79,8 @@ To learn more about how to load LoRA weights, see the [LoRA](../../using-diffuse
 
 ## LoraBaseMixin
 
-[[autodoc]] loaders.lora_base.LoraBaseMixin
+[[autodoc]] loaders.lora_base.LoraBaseMixin
+
+## WanLoraLoaderMixin
+
+[[autodoc]] loaders.lora_pipeline.WanLoraLoaderMixin
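
The new `WanLoraLoaderMixin` autodoc entry documents LoRA loading for the Wan pipelines. As a rough sketch of what that API surface looks like in practice, assuming the `Wan-AI/Wan2.1-T2V-1.3B-Diffusers` checkpoint and a placeholder LoRA repository (neither is part of this commit):

```py
import torch
from diffusers import WanPipeline
from diffusers.utils import export_to_video

# WanPipeline mixes in WanLoraLoaderMixin, so the usual LoRA API applies
pipeline = WanPipeline.from_pretrained(
    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",  # assumed checkpoint id
    torch_dtype=torch.bfloat16,
)
pipeline.to("cuda")

# placeholder LoRA repository, purely for illustration
pipeline.load_lora_weights("your-username/your-wan-lora", adapter_name="example")
pipeline.set_adapters("example", 0.9)

video = pipeline(
    prompt="A cat walks on the grass, realistic style",
    num_frames=81,
).frames[0]
export_to_video(video, "output.mp4", fps=16)
```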

docs/source/en/api/pipelines/cogvideox.md (+31 -49)

@@ -26,19 +26,32 @@
 You can find all the original CogVideoX checkpoints under the [CogVideoX](https://huggingface.co/collections/THUDM/cogvideo-66c08e62f1685a3ade464cce) collection.
 
 > [!TIP]
-> Click on the CogVideoX models in the right sidebar for more examples of how to use CogVideoX for other video generation tasks.
+> Click on the CogVideoX models in the right sidebar for more examples of other video generation tasks.
 
 The example below demonstrates how to generate a video optimized for memory or inference speed.
 
 <hfoptions id="usage">
 <hfoption id="memory">
 
+Refer to the [Reduce memory usage](../../optimization/memory) guide for more details about the various memory saving techniques.
+
+The quantized CogVideoX 5B model below requires ~16GB of VRAM.
+
 ```py
 import torch
 from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel
 from diffusers.hooks import apply_group_offloading
 from diffusers.utils import export_to_video
 
+# quantize weights to int8 with torchao
+quantization_config = TorchAoConfig("int8wo")
+transformer = CogVideoXTransformer3DModel.from_pretrained(
+    "THUDM/CogVideoX-5b",
+    subfolder="transformer",
+    quantization_config=quantization_config,
+    torch_dtype=torch.bfloat16,
+)
+
 # fp8 layerwise weight-casting
 transformer = CogVideoXTransformer3DModel.from_pretrained(
     "THUDM/CogVideoX-5b",
@@ -60,10 +73,13 @@ pipeline.to("cuda")
 # model-offloading
 pipeline.enable_model_cpu_offload()
 
-prompt = ("A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea. "
-    "The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse. "
-    "Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood, "
-    "with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting.")
+prompt = """
+A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea.
+The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse.
+Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood,
+with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting.
+"""
+
 video = pipeline(
     prompt=prompt,
     guidance_scale=6,
@@ -72,45 +88,6 @@ video = pipeline(
 export_to_video(video, "output.mp4", fps=8)
 ```
 
-Reduce memory usage even more if necessary by quantizing a model to a lower precision data type.
-
-```py
-import torch
-from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel, TorchAoConfig
-from diffusers.utils import export_to_video
-
-# quantize weights to int8 with torchao
-quantization_config = TorchAoConfig("int8wo")
-transformer = CogVideoXTransformer3DModel.from_pretrained(
-    "THUDM/CogVideoX-5b",
-    subfolder="transformer",
-    quantization_config=quantization_config,
-    torch_dtype=torch.bfloat16,
-)
-# fp8 layerwise weight-casting
-transformer.enable_layerwise_casting(
-    storage_dtype=torch.float8_e4m3fn,
-    compute_dtype=torch.bfloat16
-)
-
-pipeline = CogVideoXPipeline.from_pretrained(
-    "THUDM/CogVideoX-5b",
-    transformer=transformer,
-    torch_dtype=torch.bfloat16,
-)
-pipeline.to("cuda")
-
-# model-offloading
-pipeline.enable_model_cpu_offload()
-
-prompt = ("A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea. "
-    "The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse. "
-    "Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood, "
-    "with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting.")
-video = pipeline(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
-export_to_video(video, "output.mp4", fps=8)
-```
-
 </hfoption>
 <hfoption id="inference speed">
 
@@ -119,7 +96,6 @@ Compilation is slow the first time but subsequent calls to the pipeline are fast
 ```py
 import torch
 from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel
-from diffusers.hooks import apply_group_offloading
 from diffusers.utils import export_to_video
 
 pipeline = CogVideoXPipeline.from_pretrained(
@@ -133,10 +109,13 @@ pipeline.transformer = torch.compile(
     pipeline.transformer, mode="max-autotune", fullgraph=True
 )
 
-prompt = ("A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea. "
-    "The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse. "
-    "Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood, "
-    "with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting.")
+prompt = """
+A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea.
+The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse.
+Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood,
+with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting.
+"""
+
 video = pipeline(
     prompt=prompt,
     guidance_scale=6,
@@ -186,8 +165,11 @@ export_to_video(video, "output.mp4", fps=8)
 ).frames[0]
 export_to_video(video, "output.mp4", fps=16)
 ```
+
 - The text-to-video (T2V) checkpoints work best with a resolution of 1360x768 because that was the resolution it was pretrained on.
+
 - The image-to-video (I2V) checkpoints work with multiple resolutions. The width can vary from 768 to 1360, but the height must be 758. Both height and width must be divisible by 16.
+
 - Both T2V and I2V checkpoints work best with 81 and 161 frames. It is recommended to export the generated video at 16fps.
 
 ## CogVideoXPipeline
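
Note that the updated memory example above calls `TorchAoConfig("int8wo")` while the unchanged import line still only pulls in `CogVideoXPipeline` and `CogVideoXTransformer3DModel`. A self-contained sketch of the torchao-quantized path, with the extra import added and the prompt shortened for brevity, might look like this (not part of the commit):

```py
import torch
from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel, TorchAoConfig
from diffusers.utils import export_to_video

# quantize the transformer weights to int8 with torchao
quantization_config = TorchAoConfig("int8wo")
transformer = CogVideoXTransformer3DModel.from_pretrained(
    "THUDM/CogVideoX-5b",
    subfolder="transformer",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
)

pipeline = CogVideoXPipeline.from_pretrained(
    "THUDM/CogVideoX-5b",
    transformer=transformer,
    torch_dtype=torch.bfloat16,
)

# offload idle components to the CPU to lower peak VRAM
pipeline.enable_model_cpu_offload()

prompt = "A detailed wooden toy ship glides over a plush blue carpet that mimics ocean waves."
video = pipeline(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
export_to_video(video, "output.mp4", fps=8)
```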

docs/source/en/api/pipelines/hunyuan_video.md (+12 -7)

@@ -20,7 +20,7 @@
 
 # HunyuanVideo
 
-[HunyuanVideo](https://huggingface.co/papers/2412.03603) is a 13B diffusion transformer model designed to be competitive with closed-source video foundation models and enable wider community access. This model uses a "dual-stream to single-stream" architecture to separately process the video and text tokens first, before concatenating and feeding them to the transformer to fuse the multimodal information. A pretrained multimodal large language model (MLLM) is used as the encoder because it has better image-text alignment, better image detail description and reasoning, and it can be used as a zero-shot learner if system instructions are added to user prompts. Finally, HunyuanVideo uses a 3D causal variational autoencoder to more efficiently process video data at the original resolution and frame rate.
+[HunyuanVideo](https://huggingface.co/papers/2412.03603) is a 13B parameter diffusion transformer model designed to be competitive with closed-source video foundation models and enable wider community access. This model uses a "dual-stream to single-stream" architecture to separately process the video and text tokens first, before concatenating and feeding them to the transformer to fuse the multimodal information. A pretrained multimodal large language model (MLLM) is used as the encoder because it has better image-text alignment, better image detail description and reasoning, and it can be used as a zero-shot learner if system instructions are added to user prompts. Finally, HunyuanVideo uses a 3D causal variational autoencoder to more efficiently process video data at the original resolution and frame rate.
 
 You can find all the original HunyuanVideo checkpoints under the [Tencent](https://huggingface.co/tencent) organization.
 
@@ -32,12 +32,16 @@ The example below demonstrates how to generate a video optimized for memory or i
 <hfoptions id="usage">
 <hfoption id="memory">
 
+Refer to the [Reduce memory usage](../../optimization/memory) guide for more details about the various memory saving techniques.
+
+The quantized HunyuanVideo model below requires ~14GB of VRAM.
+
 ```py
 import torch
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
 from diffusers.utils import export_to_video
 
-# quantization
+# quantize weights to int4 with bitsandbytes
 quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True)
 transformer = HunyuanVideoTransformer3DModel.from_pretrained(
     "hunyuanvideo-community/HunyuanVideo",
@@ -52,7 +56,7 @@ pipeline = HunyuanVideoPipeline.from_pretrained(
     torch_dtype=torch.float16,
 )
 
-# model-offloading
+# model-offloading and tiling
 pipeline.enable_model_cpu_offload()
 pipeline.vae.enable_tiling()
 
@@ -71,7 +75,7 @@ import torch
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
 from diffusers.utils import export_to_video
 
-# quantization
+# quantize weights to int4 with bitsandbytes
 quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True)
 transformer = HunyuanVideoTransformer3DModel.from_pretrained(
     "hunyuanvideo-community/HunyuanVideo",
@@ -86,7 +90,7 @@ pipeline = HunyuanVideoPipeline.from_pretrained(
     torch_dtype=torch.float16,
 )
 
-# model-offloading
+# model-offloading and tiling
 pipeline.enable_model_cpu_offload()
 pipeline.vae.enable_tiling()
 
@@ -132,10 +136,11 @@ export_to_video(video, "output.mp4", fps=15)
 pipeline.load_lora_weights("https://huggingface.co/lucataco/hunyuan-steamboat-willie-10", adapter_name="steamboat-willie")
 pipeline.set_adapters("steamboat-willie", 0.9)
 
-# model-offloading
+# model-offloading and tiling
 pipeline.enable_model_cpu_offload()
 pipeline.vae.enable_tiling()
 
+# use "In the style of SWR" to trigger the LoRA
 prompt = """
 In the style of SWR. A black and white animated scene featuring a fluffy teddy bear sits on a bed of soft pillows surrounded by children's toys.
 """
@@ -150,7 +155,7 @@ export_to_video(video, "output.mp4", fps=15)
 | text encoder dtype | `torch.float16` |
 | transformer dtype | `torch.bfloat16` |
 | vae dtype | `torch.float16` |
-| `num_frames` | 4 * k + 1 |
+| `num_frames (k)` | 4 * `k` + 1 |
 
 - Try lower `shift` values (`2.0` to `5.0`) for lower resolution videos, and try higher `shift` values (`7.0` to `12.0`) for higher resolution images.
 
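
The table rows above pin down the recommended dtypes and the `num_frames` rule (`4 * k + 1`). A rough end-to-end sketch combining the quantized setup from this diff with a valid frame count (the generation arguments are illustrative, not from the commit):

```py
import torch
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
from diffusers.utils import export_to_video

# quantize the transformer weights to int4 with bitsandbytes
quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True)
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo",
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,  # transformer dtype per the table above
)

pipeline = HunyuanVideoPipeline.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo",
    transformer=transformer,
    torch_dtype=torch.float16,
)

# model-offloading and tiling
pipeline.enable_model_cpu_offload()
pipeline.vae.enable_tiling()

# num_frames should satisfy 4 * k + 1, e.g. k = 15 -> 61 frames
video = pipeline(
    prompt="A fluffy teddy bear sits on a bed of soft pillows surrounded by children's toys.",
    num_frames=61,
).frames[0]
export_to_video(video, "output.mp4", fps=15)
```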
docs/source/en/api/pipelines/ltx_video.md (+22 -57)

@@ -20,18 +20,22 @@
 
 # LTX-Video
 
-[LTX-Video](https://huggingface.co/Lightricks/LTX-Video) is a diffusion transformer designed for fast and real-time generation of high-resolution videos from text and images. The main feature of LTX-Video is the Video-VAE. The Video-VAE has a higher pixel to latent compression ratio (1:192) which enables more efficient video data processing and faster generation speed. To support and prevent the finer details from being lost during generation, the Video-VAE decoder performs the latent to pixel conversion *and* the last denoising step.
+[LTX-Video](https://huggingface.co/Lightricks/LTX-Video) is a diffusion transformer designed for fast and real-time generation of high-resolution videos from text and images. The main feature of LTX-Video is the Video-VAE. The Video-VAE has a higher pixel to latent compression ratio (1:192) which enables more efficient video data processing and faster generation speed. To support and prevent finer details from being lost during generation, the Video-VAE decoder performs the latent to pixel conversion *and* the last denoising step.
 
 You can find all the original LTX-Video checkpoints under the [Lightricks](https://huggingface.co/Lightricks) organization.
 
 > [!TIP]
-> Click on the LTX-Video models in the right sidebar for more examples of how to use LTX-Video for other video generation tasks.
+> Click on the LTX-Video models in the right sidebar for more examples of other video generation tasks.
 
 The example below demonstrates how to generate a video optimized for memory or inference speed.
 
 <hfoptions id="usage">
 <hfoption id="memory">
 
+Refer to the [Reduce memory usage](../../optimization/memory) guide for more details about the various memory saving techniques.
+
+The LTX-Video model below requires ~10GB of VRAM.
+
 ```py
 import torch
 from diffusers import LTXPipeline, LTXVideoTransformer3DModel
@@ -58,7 +62,9 @@ pipeline.transformer.enable_group_offload(onload_device=onload_device, offload_d
 apply_group_offloading(pipeline.text_encoder, onload_device=onload_device, offload_type="block_level", num_blocks_per_group=2)
 apply_group_offloading(pipeline.vae, onload_device=onload_device, offload_type="leaf_level")
 
-prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
+prompt = """
+A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage
+"""
 negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
 
 video = pipeline(
@@ -74,53 +80,6 @@ video = pipeline(
 export_to_video(video, "output.mp4", fps=24)
 ```
 
-Reduce memory usage even more if necessary by quantizing a model to a lower precision data type.
-
-```py
-import torch
-from diffusers.utils import export_to_video
-from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, LTXVideoTransformer3DModel, LTXPipeline
-from transformers import BitsAndBytesConfig as BitsAndBytesConfig, T5EncoderModel
-
-# quantize weights to int8 with bitsandbytes
-quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-text_encoder = T5EncoderModel.from_pretrained(
-    "Lightricks/LTX-Video",
-    subfolder="text_encoder",
-    quantization_config=quantization_config,
-    torch_dtype=torch.bfloat16,
-)
-
-quantization_config = DiffusersBitsAndBytesConfig(load_in_8bit=True)
-transformer = LTXVideoTransformer3DModel.from_pretrained(
-    "Lightricks/LTX-Video",
-    subfolder="transformer",
-    quantization_config=quantization_config,
-    torch_dtype=torch.bfloat16,
-)
-
-pipeline = LTXPipeline.from_pretrained(
-    "Lightricks/LTX-Video",
-    text_encoder=text_en,
-    transformer=transformer,
-    torch_dtype=torch.bfloat16,
-)
-
-prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
-negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
-video = pipeline(
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    width=768,
-    height=512,
-    num_frames=161,
-    decode_timestep=0.03,
-    decode_noise_scale=0.025,
-    num_inference_steps=50,
-).frames[0]
-export_to_video(video, "output.mp4", fps=24)
-```
-
 </hfoption>
 <hfoption id="inference speed">
 
@@ -141,7 +100,9 @@ pipeline.transformer = torch.compile(
     pipeline.transformer, mode="max-autotune", fullgraph=True
 )
 
-prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
+prompt = """
+A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage
+"""
 negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
 
 video = pipeline(
@@ -162,24 +123,27 @@ export_to_video(video, "output.mp4", fps=24)
 
 ## Notes
 
-- LTX-Video supports LoRAs with [`~LTXVideoLoraLoaderMixin.load_lora_weights`].
+- LTX-Video supports LoRAs with [`~loaders.LTXVideoLoraLoaderMixin.load_lora_weights`].
 
 ```py
 import torch
 from diffusers import LTXConditionPipeline
-from diffusers.utils import export_to_video
+from diffusers.utils import export_to_video, load_image
 
 pipeline = LTXConditionPipeline.from_pretrained(
     "Lightricks/LTX-Video-0.9.5", torch_dtype=torch.bfloat16
 )
 
 pipeline.load_lora_weights("Lightricks/LTX-Video-Cakeify-LoRA", adapter_name="cakeify")
-pipeline.set_adapters("cakeify", 0.9)
+pipeline.set_adapters("cakeify")
 
-prompt = "CAKEIFY a person using a knife to cut a cake shaped like a pair of cowboy boots"
+# use "CAKEIFY" to trigger the LoRA
+prompt = "CAKEIFY a person using a knife to cut a cake shaped like a cereal box"
+image = load_image("https://i5.walmartimages.com/asr/c0463def-4995-47a7-9486-294fff8cf9fc.f9779f3fc4c621cf1fe86465af1d2ecd.jpeg")
 
 video = pipeline(
     prompt=prompt,
+    image=image,
     width=768,
     height=512,
     num_frames=161,
@@ -189,7 +153,8 @@ export_to_video(video, "output.mp4", fps=24)
 ).frames[0]
 export_to_video(video, "output.mp4", fps=24)
 ```
-- LTX-Video supports loading from single files, such as [GGUF checkpoints](../../quantization/gguf), with [`FromOriginalModelMixin.from_single_file`] or [`FromSingleFileMixin.from_single_file`].
+
+- LTX-Video supports loading from single files, such as [GGUF checkpoints](../../quantization/gguf), with [`loaders.FromOriginalModelMixin.from_single_file`] or [`loaders.FromSingleFileMixin.from_single_file`].
 
 ```py
 import torch
@@ -204,7 +169,7 @@ export_to_video(video, "output.mp4", fps=24)
 pipeline = LTXPipeline.from_pretrained(
     "Lightricks/LTX-Video",
     transformer=transformer,
-    torch_dtype=bfloat16
+    torch_dtype=torch.bfloat16
 )
 ```
 
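
The single-file hunk above only shows the pipeline assembly; the `transformer` it references would come from `from_single_file`. A minimal sketch of the GGUF path, assuming a community GGUF checkpoint URL that is not part of this commit:

```py
import torch
from diffusers import GGUFQuantizationConfig, LTXPipeline, LTXVideoTransformer3DModel

# load the transformer from a single GGUF file
# (the URL below is illustrative; substitute any LTX-Video GGUF checkpoint)
transformer = LTXVideoTransformer3DModel.from_single_file(
    "https://huggingface.co/city96/LTX-Video-gguf/blob/main/ltx-video-2b-v0.9-Q3_K_S.gguf",
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)

pipeline = LTXPipeline.from_pretrained(
    "Lightricks/LTX-Video",
    transformer=transformer,
    torch_dtype=torch.bfloat16,
)
```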