Commit b31ed8e

wan
1 parent a2bdc06 commit b31ed8e

5 files changed: +266 -147 lines changed

docs/source/en/api/loaders/lora.md (+5 -1)

@@ -79,4 +79,8 @@ To learn more about how to load LoRA weights, see the [LoRA](../../using-diffuse
 
 ## LoraBaseMixin
 
-[[autodoc]] loaders.lora_base.LoraBaseMixin
+[[autodoc]] loaders.lora_base.LoraBaseMixin
+
+## WanLoraLoaderMixin
+
+[[autodoc]] loaders.lora_pipeline.WanLoraLoaderMixin
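
The new `WanLoraLoaderMixin` autodoc entry documents LoRA loading for the Wan pipelines. As a rough sketch of what that API surface looks like in practice, assuming the `Wan-AI/Wan2.1-T2V-1.3B-Diffusers` checkpoint and a placeholder LoRA repository (neither is part of this commit):

```py
import torch
from diffusers import WanPipeline
from diffusers.utils import export_to_video

# WanPipeline mixes in WanLoraLoaderMixin, so the usual LoRA API applies
pipeline = WanPipeline.from_pretrained(
    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",  # assumed checkpoint id
    torch_dtype=torch.bfloat16,
)
pipeline.to("cuda")

# placeholder LoRA repository, purely for illustration
pipeline.load_lora_weights("your-username/your-wan-lora", adapter_name="example")
pipeline.set_adapters("example", 0.9)

video = pipeline(
    prompt="A cat walks on the grass, realistic style",
    num_frames=81,
).frames[0]
export_to_video(video, "output.mp4", fps=16)
```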

docs/source/en/api/pipelines/cogvideox.md (+31 -49)

@@ -26,19 +26,32 @@
 You can find all the original CogVideoX checkpoints under the [CogVideoX](https://huggingface.co/collections/THUDM/cogvideo-66c08e62f1685a3ade464cce) collection.
 
 > [!TIP]
-> Click on the CogVideoX models in the right sidebar for more examples of how to use CogVideoX for other video generation tasks.
+> Click on the CogVideoX models in the right sidebar for more examples of other video generation tasks.
 
 The example below demonstrates how to generate a video optimized for memory or inference speed.
 
 <hfoptions id="usage">
 <hfoption id="memory">
 
+Refer to the [Reduce memory usage](../../optimization/memory) guide for more details about the various memory saving techniques.
+
+The quantized CogVideoX 5B model below requires ~16GB of VRAM.
+
 ```py
 import torch
 from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel
 from diffusers.hooks import apply_group_offloading
 from diffusers.utils import export_to_video
 
+# quantize weights to int8 with torchao
+quantization_config = TorchAoConfig("int8wo")
+transformer = CogVideoXTransformer3DModel.from_pretrained(
+    "THUDM/CogVideoX-5b",
+    subfolder="transformer",
+    quantization_config=quantization_config,
+    torch_dtype=torch.bfloat16,
+)
+
 # fp8 layerwise weight-casting
 transformer = CogVideoXTransformer3DModel.from_pretrained(
     "THUDM/CogVideoX-5b",
@@ -60,10 +73,13 @@ pipeline.to("cuda")
 # model-offloading
 pipeline.enable_model_cpu_offload()
 
-prompt = ("A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea. "
-    "The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse. "
-    "Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood, "
-    "with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting.")
+prompt = """
+A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea.
+The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse.
+Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood,
+with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting.
+"""
+
 video = pipeline(
     prompt=prompt,
     guidance_scale=6,
@@ -72,45 +88,6 @@ video = pipeline(
 export_to_video(video, "output.mp4", fps=8)
 ```
 
-Reduce memory usage even more if necessary by quantizing a model to a lower precision data type.
-
-```py
-import torch
-from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel, TorchAoConfig
-from diffusers.utils import export_to_video
-
-# quantize weights to int8 with torchao
-quantization_config = TorchAoConfig("int8wo")
-transformer = CogVideoXTransformer3DModel.from_pretrained(
-    "THUDM/CogVideoX-5b",
-    subfolder="transformer",
-    quantization_config=quantization_config,
-    torch_dtype=torch.bfloat16,
-)
-# fp8 layerwise weight-casting
-transformer.enable_layerwise_casting(
-    storage_dtype=torch.float8_e4m3fn,
-    compute_dtype=torch.bfloat16
-)
-
-pipeline = CogVideoXPipeline.from_pretrained(
-    "THUDM/CogVideoX-5b",
-    transformer=transformer,
-    torch_dtype=torch.bfloat16,
-)
-pipeline.to("cuda")
-
-# model-offloading
-pipeline.enable_model_cpu_offload()
-
-prompt = ("A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea. "
-    "The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse. "
-    "Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood, "
-    "with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting.")
-video = pipeline(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
-export_to_video(video, "output.mp4", fps=8)
-```
-
 </hfoption>
 <hfoption id="inference speed">
 
@@ -119,7 +96,6 @@ Compilation is slow the first time but subsequent calls to the pipeline are fast
 ```py
 import torch
 from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel
-from diffusers.hooks import apply_group_offloading
 from diffusers.utils import export_to_video
 
 pipeline = CogVideoXPipeline.from_pretrained(
@@ -133,10 +109,13 @@ pipeline.transformer = torch.compile(
     pipeline.transformer, mode="max-autotune", fullgraph=True
 )
 
-prompt = ("A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea. "
-    "The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse. "
-    "Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood, "
-    "with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting.")
+prompt = """
+A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea.
+The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse.
+Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood,
+with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting.
+"""
+
 video = pipeline(
     prompt=prompt,
     guidance_scale=6,
@@ -186,8 +165,11 @@ export_to_video(video, "output.mp4", fps=8)
 ).frames[0]
 export_to_video(video, "output.mp4", fps=16)
 ```
+
 - The text-to-video (T2V) checkpoints work best with a resolution of 1360x768 because that was the resolution it was pretrained on.
+
 - The image-to-video (I2V) checkpoints work with multiple resolutions. The width can vary from 768 to 1360, but the height must be 758. Both height and width must be divisible by 16.
+
 - Both T2V and I2V checkpoints work best with 81 and 161 frames. It is recommended to export the generated video at 16fps.
 
 ## CogVideoXPipeline
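
Note that the updated memory example above calls `TorchAoConfig("int8wo")` while the unchanged import line still only pulls in `CogVideoXPipeline` and `CogVideoXTransformer3DModel`. A self-contained sketch of the torchao-quantized path, with the extra import added and the prompt shortened for brevity, might look like this (not part of the commit):

```py
import torch
from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel, TorchAoConfig
from diffusers.utils import export_to_video

# quantize the transformer weights to int8 with torchao
quantization_config = TorchAoConfig("int8wo")
transformer = CogVideoXTransformer3DModel.from_pretrained(
    "THUDM/CogVideoX-5b",
    subfolder="transformer",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
)

pipeline = CogVideoXPipeline.from_pretrained(
    "THUDM/CogVideoX-5b",
    transformer=transformer,
    torch_dtype=torch.bfloat16,
)

# offload idle components to the CPU to lower peak VRAM
pipeline.enable_model_cpu_offload()

prompt = "A detailed wooden toy ship glides over a plush blue carpet that mimics ocean waves."
video = pipeline(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
export_to_video(video, "output.mp4", fps=8)
```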

docs/source/en/api/pipelines/hunyuan_video.md (+12 -7)

@@ -20,7 +20,7 @@
 
 # HunyuanVideo
 
-[HunyuanVideo](https://huggingface.co/papers/2412.03603) is a 13B diffusion transformer model designed to be competitive with closed-source video foundation models and enable wider community access. This model uses a "dual-stream to single-stream" architecture to separately process the video and text tokens first, before concatenating and feeding them to the transformer to fuse the multimodal information. A pretrained multimodal large language model (MLLM) is used as the encoder because it has better image-text alignment, better image detail description and reasoning, and it can be used as a zero-shot learner if system instructions are added to user prompts. Finally, HunyuanVideo uses a 3D causal variational autoencoder to more efficiently process video data at the original resolution and frame rate.
+[HunyuanVideo](https://huggingface.co/papers/2412.03603) is a 13B parameter diffusion transformer model designed to be competitive with closed-source video foundation models and enable wider community access. This model uses a "dual-stream to single-stream" architecture to separately process the video and text tokens first, before concatenating and feeding them to the transformer to fuse the multimodal information. A pretrained multimodal large language model (MLLM) is used as the encoder because it has better image-text alignment, better image detail description and reasoning, and it can be used as a zero-shot learner if system instructions are added to user prompts. Finally, HunyuanVideo uses a 3D causal variational autoencoder to more efficiently process video data at the original resolution and frame rate.
 
 You can find all the original HunyuanVideo checkpoints under the [Tencent](https://huggingface.co/tencent) organization.
 
@@ -32,12 +32,16 @@ The example below demonstrates how to generate a video optimized for memory or i
 <hfoptions id="usage">
 <hfoption id="memory">
 
+Refer to the [Reduce memory usage](../../optimization/memory) guide for more details about the various memory saving techniques.
+
+The quantized HunyuanVideo model below requires ~14GB of VRAM.
+
 ```py
 import torch
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
 from diffusers.utils import export_to_video
 
-# quantization
+# quantize weights to int4 with bitsandbytes
 quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True)
 transformer = HunyuanVideoTransformer3DModel.from_pretrained(
     "hunyuanvideo-community/HunyuanVideo",
@@ -52,7 +56,7 @@ pipeline = HunyuanVideoPipeline.from_pretrained(
     torch_dtype=torch.float16,
 )
 
-# model-offloading
+# model-offloading and tiling
 pipeline.enable_model_cpu_offload()
 pipeline.vae.enable_tiling()
 
@@ -71,7 +75,7 @@ import torch
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
 from diffusers.utils import export_to_video
 
-# quantization
+# quantize weights to int4 with bitsandbytes
 quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True)
 transformer = HunyuanVideoTransformer3DModel.from_pretrained(
     "hunyuanvideo-community/HunyuanVideo",
@@ -86,7 +90,7 @@ pipeline = HunyuanVideoPipeline.from_pretrained(
     torch_dtype=torch.float16,
 )
 
-# model-offloading
+# model-offloading and tiling
 pipeline.enable_model_cpu_offload()
 pipeline.vae.enable_tiling()
 
@@ -132,10 +136,11 @@ export_to_video(video, "output.mp4", fps=15)
 pipeline.load_lora_weights("https://huggingface.co/lucataco/hunyuan-steamboat-willie-10", adapter_name="steamboat-willie")
 pipeline.set_adapters("steamboat-willie", 0.9)
 
-# model-offloading
+# model-offloading and tiling
 pipeline.enable_model_cpu_offload()
 pipeline.vae.enable_tiling()
 
+# use "In the style of SWR" to trigger the LoRA
 prompt = """
 In the style of SWR. A black and white animated scene featuring a fluffy teddy bear sits on a bed of soft pillows surrounded by children's toys.
 """
@@ -150,7 +155,7 @@ export_to_video(video, "output.mp4", fps=15)
 | text encoder dtype | `torch.float16` |
 | transformer dtype | `torch.bfloat16` |
 | vae dtype | `torch.float16` |
-| `num_frames` | 4 * k + 1 |
+| `num_frames (k)` | 4 * `k` + 1 |
 
 - Try lower `shift` values (`2.0` to `5.0`) for lower resolution videos, and try higher `shift` values (`7.0` to `12.0`) for higher resolution images.
 
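
The table rows above pin down the recommended dtypes and the `num_frames` rule (`4 * k + 1`). A rough end-to-end sketch combining the quantized setup from this diff with a valid frame count (the generation arguments are illustrative, not from the commit):

```py
import torch
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
from diffusers.utils import export_to_video

# quantize the transformer weights to int4 with bitsandbytes
quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True)
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo",
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,  # transformer dtype per the table above
)

pipeline = HunyuanVideoPipeline.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo",
    transformer=transformer,
    torch_dtype=torch.float16,
)

# model-offloading and tiling
pipeline.enable_model_cpu_offload()
pipeline.vae.enable_tiling()

# num_frames should satisfy 4 * k + 1, e.g. k = 15 -> 61 frames
video = pipeline(
    prompt="A fluffy teddy bear sits on a bed of soft pillows surrounded by children's toys.",
    num_frames=61,
).frames[0]
export_to_video(video, "output.mp4", fps=15)
```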
docs/source/en/api/pipelines/ltx_video.md (+22 -57)

@@ -20,18 +20,22 @@
 
 # LTX-Video
 
-[LTX-Video](https://huggingface.co/Lightricks/LTX-Video) is a diffusion transformer designed for fast and real-time generation of high-resolution videos from text and images. The main feature of LTX-Video is the Video-VAE. The Video-VAE has a higher pixel to latent compression ratio (1:192) which enables more efficient video data processing and faster generation speed. To support and prevent the finer details from being lost during generation, the Video-VAE decoder performs the latent to pixel conversion *and* the last denoising step.
+[LTX-Video](https://huggingface.co/Lightricks/LTX-Video) is a diffusion transformer designed for fast and real-time generation of high-resolution videos from text and images. The main feature of LTX-Video is the Video-VAE. The Video-VAE has a higher pixel to latent compression ratio (1:192) which enables more efficient video data processing and faster generation speed. To support and prevent finer details from being lost during generation, the Video-VAE decoder performs the latent to pixel conversion *and* the last denoising step.
 
 You can find all the original LTX-Video checkpoints under the [Lightricks](https://huggingface.co/Lightricks) organization.
 
 > [!TIP]
-> Click on the LTX-Video models in the right sidebar for more examples of how to use LTX-Video for other video generation tasks.
+> Click on the LTX-Video models in the right sidebar for more examples of other video generation tasks.
 
 The example below demonstrates how to generate a video optimized for memory or inference speed.
 
 <hfoptions id="usage">
 <hfoption id="memory">
 
+Refer to the [Reduce memory usage](../../optimization/memory) guide for more details about the various memory saving techniques.
+
+The LTX-Video model below requires ~10GB of VRAM.
+
 ```py
 import torch
 from diffusers import LTXPipeline, LTXVideoTransformer3DModel
@@ -58,7 +62,9 @@ pipeline.transformer.enable_group_offload(onload_device=onload_device, offload_d
 apply_group_offloading(pipeline.text_encoder, onload_device=onload_device, offload_type="block_level", num_blocks_per_group=2)
 apply_group_offloading(pipeline.vae, onload_device=onload_device, offload_type="leaf_level")
 
-prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
+prompt = """
+A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage
+"""
 negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
 
 video = pipeline(
@@ -74,53 +80,6 @@ video = pipeline(
 export_to_video(video, "output.mp4", fps=24)
 ```
 
-Reduce memory usage even more if necessary by quantizing a model to a lower precision data type.
-
-```py
-import torch
-from diffusers.utils import export_to_video
-from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, LTXVideoTransformer3DModel, LTXPipeline
-from transformers import BitsAndBytesConfig as BitsAndBytesConfig, T5EncoderModel
-
-# quantize weights to int8 with bitsandbytes
-quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-text_encoder = T5EncoderModel.from_pretrained(
-    "Lightricks/LTX-Video",
-    subfolder="text_encoder",
-    quantization_config=quantization_config,
-    torch_dtype=torch.bfloat16,
-)
-
-quantization_config = DiffusersBitsAndBytesConfig(load_in_8bit=True)
-transformer = LTXVideoTransformer3DModel.from_pretrained(
-    "Lightricks/LTX-Video",
-    subfolder="transformer",
-    quantization_config=quantization_config,
-    torch_dtype=torch.bfloat16,
-)
-
-pipeline = LTXPipeline.from_pretrained(
-    "Lightricks/LTX-Video",
-    text_encoder=text_en,
-    transformer=transformer,
-    torch_dtype=torch.bfloat16,
-)
-
-prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
-negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
-video = pipeline(
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    width=768,
-    height=512,
-    num_frames=161,
-    decode_timestep=0.03,
-    decode_noise_scale=0.025,
-    num_inference_steps=50,
-).frames[0]
-export_to_video(video, "output.mp4", fps=24)
-```
-
 </hfoption>
 <hfoption id="inference speed">
 
@@ -141,7 +100,9 @@ pipeline.transformer = torch.compile(
     pipeline.transformer, mode="max-autotune", fullgraph=True
 )
 
-prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
+prompt = """
+A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage
+"""
 negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
 
 video = pipeline(
@@ -162,24 +123,27 @@ export_to_video(video, "output.mp4", fps=24)
 
 ## Notes
 
-- LTX-Video supports LoRAs with [`~LTXVideoLoraLoaderMixin.load_lora_weights`].
+- LTX-Video supports LoRAs with [`~loaders.LTXVideoLoraLoaderMixin.load_lora_weights`].
 
 ```py
 import torch
 from diffusers import LTXConditionPipeline
-from diffusers.utils import export_to_video
+from diffusers.utils import export_to_video, load_image
 
 pipeline = LTXConditionPipeline.from_pretrained(
     "Lightricks/LTX-Video-0.9.5", torch_dtype=torch.bfloat16
 )
 
 pipeline.load_lora_weights("Lightricks/LTX-Video-Cakeify-LoRA", adapter_name="cakeify")
-pipeline.set_adapters("cakeify", 0.9)
+pipeline.set_adapters("cakeify")
 
-prompt = "CAKEIFY a person using a knife to cut a cake shaped like a pair of cowboy boots"
+# use "CAKEIFY" to trigger the LoRA
+prompt = "CAKEIFY a person using a knife to cut a cake shaped like a cereal box"
+image = load_image("https://i5.walmartimages.com/asr/c0463def-4995-47a7-9486-294fff8cf9fc.f9779f3fc4c621cf1fe86465af1d2ecd.jpeg")
 
 video = pipeline(
     prompt=prompt,
+    image=image,
     width=768,
     height=512,
     num_frames=161,
@@ -189,7 +153,8 @@ export_to_video(video, "output.mp4", fps=24)
 ).frames[0]
 export_to_video(video, "output.mp4", fps=24)
 ```
-- LTX-Video supports loading from single files, such as [GGUF checkpoints](../../quantization/gguf), with [`FromOriginalModelMixin.from_single_file`] or [`FromSingleFileMixin.from_single_file`].
+
+- LTX-Video supports loading from single files, such as [GGUF checkpoints](../../quantization/gguf), with [`loaders.FromOriginalModelMixin.from_single_file`] or [`loaders.FromSingleFileMixin.from_single_file`].
 
 ```py
 import torch
@@ -204,7 +169,7 @@ export_to_video(video, "output.mp4", fps=24)
 pipeline = LTXPipeline.from_pretrained(
     "Lightricks/LTX-Video",
     transformer=transformer,
-    torch_dtype=bfloat16
+    torch_dtype=torch.bfloat16
 )
 ```
 
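
The single-file hunk above only shows the pipeline assembly; the `transformer` it references would come from `from_single_file`. A minimal sketch of the GGUF path, assuming a community GGUF checkpoint URL that is not part of this commit:

```py
import torch
from diffusers import GGUFQuantizationConfig, LTXPipeline, LTXVideoTransformer3DModel

# load the transformer from a single GGUF file
# (the URL below is illustrative; substitute any LTX-Video GGUF checkpoint)
transformer = LTXVideoTransformer3DModel.from_single_file(
    "https://huggingface.co/city96/LTX-Video-gguf/blob/main/ltx-video-2b-v0.9-Q3_K_S.gguf",
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)

pipeline = LTXPipeline.from_pretrained(
    "Lightricks/LTX-Video",
    transformer=transformer,
    torch_dtype=torch.bfloat16,
)
```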