Source: examples/inference/basic
Basic Video Generation Tutorial¶
The VideoGenerator class provides the primary Python interface for offline video generation, that is, for interacting with a diffusion pipeline without running a separate inference API server.
Requirements¶
- At least a single NVIDIA GPU with CUDA 12.4.
- Python 3.10-3.12
Installation¶
If you have not installed FastVideo, please follow these instructions first.
Usage¶
The first script in this example shows the most basic usage of FastVideo. If you are new to Python and FastVideo, you should start here.
# if you have not cloned the directory:
git clone https://github.com/hao-ai-lab/FastVideo.git && cd FastVideo
python examples/inference/basic/basic.py
For an example on Apple silicon:
For an example running DMD+VSA inference:
Basic Walkthrough¶
All you need to generate videos on multiple GPUs with state-of-the-art diffusion pipelines is the following few lines!
from fastvideo import VideoGenerator
def main():
    """Generate a single video from a text prompt with the Wan2.1 T2V 1.3B model."""
    model_id = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
    generator = VideoGenerator.from_pretrained(model_id, num_gpus=1)

    # The prompt is assembled from adjacent string literals to keep lines short.
    prompt = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
    )
    video = generator.generate_video(prompt)


if __name__ == "__main__":
    main()
Additional Files¶
basic.py
from fastvideo import VideoGenerator
# from fastvideo.configs.sample import SamplingParam
OUTPUT_PATH = "video_samples"
def main():
    """Load Wan2.1 T2V 1.3B once and render two prompts into ``OUTPUT_PATH``."""
    # FastVideo automatically uses the optimal default arguments for the model;
    # for a local path it makes a best-effort attempt to identify them.
    load_options = dict(
        # FastVideo handles the distributed setup automatically.
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        # Set to False if low on CPU RAM or hitting an obscure
        # "CUDA error: Invalid argument".
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )
    generator = VideoGenerator.from_pretrained("Wan-AI/Wan2.1-T2V-1.3B-Diffusers", **load_options)

    # Optional: customise sampling (requires the SamplingParam import), e.g.
    # sampling_param = SamplingParam.from_pretrained("Wan-AI/Wan2.1-T2V-1.3B-Diffusers")
    # sampling_param.num_frames = 45
    # sampling_param.image_path = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
    # generator.generate_video(prompt, sampling_param=sampling_param, output_path="wan_t2v_videos/")

    raccoon_prompt = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
    )
    lion_prompt = (
        "A majestic lion strides across the golden savanna, its powerful frame "
        "glistening under the warm afternoon sun. The tall grass ripples gently in "
        "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
        "embodying the raw energy of the wild. Low angle, steady tracking shot, "
        "cinematic."
    )

    # The same simple API is used regardless of GPU count, and the model is
    # not reloaded between prompts.
    for text in (raccoon_prompt, lion_prompt):
        generator.generate_video(text, output_path=OUTPUT_PATH, save_video=True)


if __name__ == "__main__":
    main()
basic_cosmos2_5_i2w.py
# SPDX-License-Identifier: Apache-2.0
from fastvideo import VideoGenerator
from fastvideo.configs.sample import SamplingParam
def main():
    """Run one Cosmos-Predict2.5 image-to-world generation and shut the generator down."""
    # Either a local diffusers model directory or a Hugging Face model ID.
    model_path = "KyleShao/Cosmos-Predict2.5-2B-Diffusers"

    offload_options = dict(
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,
    )
    generator = VideoGenerator.from_pretrained(
        model_path,
        num_gpus=1,
        use_fsdp_inference=False,  # set True if GPU is out of memory
        **offload_options,
    )
    sampling_param = SamplingParam.from_pretrained(model_path)

    # image2world example from the official repo
    conditioning_image = "assets/images/bus_terminal.jpg"
    scene_description = (
        "A nighttime city bus terminal gradually shifts from stillness to subtle movement. "
        "At first, multiple double-decker buses are parked under the glow of overhead lights, "
        "with a central bus labeled '87D' facing forward and stationary. "
        "As the video progresses, the bus in the middle moves ahead slowly, its headlights brightening the surrounding area "
        "and casting reflections onto adjacent vehicles. "
        "The motion creates space in the lineup, signaling activity within the otherwise quiet station. "
        "It then comes to a smooth stop, resuming its position in line. "
        "Overhead signage in Chinese characters remains illuminated, enhancing the vibrant, urban night scene."
    )

    generator.generate_video(
        scene_description,
        sampling_param=sampling_param,
        image_path=str(conditioning_image),
        num_cond_frames=1,
        output_path="outputs_video/cosmos2_5_i2w.mp4",
        save_video=True,
    )
    generator.shutdown()


if __name__ == "__main__":
    main()
basic_cosmos2_5_t2w.py
from fastvideo import VideoGenerator
from fastvideo.configs.sample import SamplingParam
def main():
    """Run one Cosmos-Predict2.5 text-to-world generation and shut the generator down."""
    # Either a local diffusers model directory or a Hugging Face model ID.
    model_path = "KyleShao/Cosmos-Predict2.5-2B-Diffusers"

    offload_options = dict(
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,
    )
    generator = VideoGenerator.from_pretrained(
        model_path,
        num_gpus=1,
        use_fsdp_inference=False,  # set True if GPU is out of memory
        **offload_options,
    )

    # Default sampling parameters (negative_prompt, resolution, steps, etc.)
    sampling_param = SamplingParam.from_pretrained(model_path)

    scene_description = (
        "A high-definition video captures the precision of robotic welding in an industrial setting. "
        "The first frame showcases a robotic arm, equipped with a welding torch, positioned over a large metal structure. "
        "The welding process is in full swing, with bright sparks and intense light illuminating the scene, "
        "creating a vivid display of blue and white hues. "
        "A significant amount of smoke billows around the welding area, partially obscuring the view but emphasizing the heat and activity. "
        "The background reveals parts of the workshop environment, including a ventilation system and various pieces of machinery, "
        "indicating a busy and functional industrial workspace. "
        "As the video progresses, the robotic arm maintains its steady position, continuing the welding process and moving to its left. "
        "The welding torch consistently emits sparks and light, and the smoke continues to rise, diffusing slightly as it moves upward. "
        "The metal surface beneath the torch shows ongoing signs of heating and melting. "
        "The scene retains its industrial ambiance, with the welding sparks and smoke dominating the visual field, "
        "underscoring the ongoing nature of the welding operation."
    )

    generator.generate_video(
        scene_description,
        sampling_param=sampling_param,
        output_path="outputs_video/cosmos2_5_t2w.mp4",
        save_video=True,
    )
    generator.shutdown()


if __name__ == "__main__":
    main()
basic_cosmos2_5_v2w.py
# SPDX-License-Identifier: Apache-2.0
from fastvideo import VideoGenerator
from fastvideo.configs.sample import SamplingParam
def main():
    """Run one Cosmos-Predict2.5 video-to-world generation and shut the generator down."""
    # Either a local diffusers model directory or a Hugging Face model ID.
    model_path = "KyleShao/Cosmos-Predict2.5-2B-Diffusers"

    offload_options = dict(
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,
    )
    generator = VideoGenerator.from_pretrained(
        model_path,
        num_gpus=1,
        use_fsdp_inference=False,  # set True if GPU is out of memory
        **offload_options,
    )
    sampling_param = SamplingParam.from_pretrained(model_path)

    # video2world example from the official repo
    conditioning_video = "assets/videos/robot_pouring.mp4"
    scene_description = (
        "A robotic arm, primarily white with black joints and cables, is shown in a clean, modern indoor setting with a white tabletop. "
        "The arm, equipped with a gripper holding a small, light green pitcher, is positioned above a clear glass containing a reddish-brown liquid and a spoon. "
        "The robotic arm is in the process of pouring a transparent liquid into the glass. "
        "To the left of the pitcher, there is an opened jar with a similar reddish-brown substance visible through its transparent body. "
        "In the background, a vase with white flowers and a brown couch are partially visible, adding to the contemporary ambiance. "
        "The lighting is bright, casting soft shadows on the table. "
        "The robotic arm's movements are smooth and controlled, demonstrating precision in its task. "
        "As the video progresses, the robotic arm completes the pour, leaving the glass half-filled with the reddish-brown liquid. "
        "The jar remains untouched throughout the sequence, and the spoon inside the glass remains stationary. "
        "The other robotic arm on the right side also stays stationary throughout the video. "
        "The final frame captures the robotic arm with the pitcher finishing the pour, with the glass now filled to a higher level, while the pitcher is slightly tilted but still held securely by the gripper."
    )

    generator.generate_video(
        scene_description,
        sampling_param=sampling_param,
        video_path=str(conditioning_video),
        num_cond_frames=1,
        output_path="outputs_video/cosmos2_5_v2w.mp4",
        save_video=True,
    )
    generator.shutdown()


if __name__ == "__main__":
    main()
basic_dmd.py
import os
import time
from fastvideo import VideoGenerator
from fastvideo.configs.sample import SamplingParam
OUTPUT_PATH = "video_samples_dmd2"
def main():
    """Time model loading and two generations for the FastWan2.1 DMD+VSA example.

    The attention backend is selected through the FASTVIDEO_ATTENTION_BACKEND
    environment variable before the generator is created. Both videos are
    written to ``OUTPUT_PATH`` and the three timings are printed at the end.
    """
    os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "VIDEO_SPARSE_ATTN"

    model_name = "FastVideo/FastWan2.1-T2V-1.3B-Diffusers"

    tick = time.perf_counter()
    generator = VideoGenerator.from_pretrained(
        model_name,
        # FastVideo handles the distributed setup automatically.
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        # Adjust these offload parameters if you have < 32GB of VRAM.
        text_encoder_cpu_offload=True,
        # Set to False if low on CPU RAM or hitting an obscure
        # "CUDA error: Invalid argument".
        pin_cpu_memory=True,
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        VSA_sparsity=0.8,
    )
    load_time = time.perf_counter() - tick

    sampling_param = SamplingParam.from_pretrained(model_name)

    tokyo_prompt = (
        "A neon-lit alley in futuristic Tokyo during a heavy rainstorm at night. "
        "The puddles reflect glowing signs in kanji, advertising ramen, karaoke, and VR arcades. "
        "A woman in a translucent raincoat walks briskly with an LED umbrella. "
        "Steam rises from a street food cart, and a cat darts across the screen. "
        "Raindrops are visible on the camera lens, creating a cinematic bokeh effect."
    )
    tick = time.perf_counter()
    video = generator.generate_video(
        tokyo_prompt,
        output_path=OUTPUT_PATH,
        save_video=True,
        sampling_param=sampling_param,
    )
    gen_time = time.perf_counter() - tick

    # A second prompt reuses the already-loaded model. As in the original
    # script, this call does not pass sampling_param.
    lion_prompt = (
        "A majestic lion strides across the golden savanna, its powerful frame "
        "glistening under the warm afternoon sun. The tall grass ripples gently in "
        "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
        "embodying the raw energy of the wild. Low angle, steady tracking shot, "
        "cinematic."
    )
    tick = time.perf_counter()
    video2 = generator.generate_video(lion_prompt, output_path=OUTPUT_PATH, save_video=True)
    gen_time2 = time.perf_counter() - tick

    print(f"Time taken to load model: {load_time} seconds")
    print(f"Time taken to generate video: {gen_time} seconds")
    print(f"Time taken to generate video2: {gen_time2} seconds")


if __name__ == "__main__":
    main()
basic_gamecraft.py
# SPDX-License-Identifier: Apache-2.0
"""
Basic inference script for HunyuanGameCraft video generation.
HunyuanGameCraft generates game-like videos with camera/action control.
It takes an optional image input and generates video with camera motion
based on simple action commands (forward, left, right, backward, rotations).
Available actions:
- forward (w): Move camera forward
- backward (s): Move camera backward
- left (a): Move camera left (strafe)
- right (d): Move camera right (strafe)
- left_rot: Rotate camera left (pan)
- right_rot: Rotate camera right (pan)
- up_rot: Rotate camera up (tilt)
- down_rot: Rotate camera down (tilt)
T2V vs I2V:
- Default: I2V (uses a default reference image). Set GAMECRAFT_I2V_IMAGE to a
URL or path to use a different image.
- T2V only (no reference image): run with GAMECRAFT_I2V_IMAGE= (empty).
"""
import os
import torch
from fastvideo import VideoGenerator
from fastvideo.models.camera import create_camera_trajectory
# Model configuration (use GAMECRAFT_MODEL_PATH for local weights)
MODEL_PATH = os.environ.get("GAMECRAFT_MODEL_PATH", "FastVideo/HunyuanGameCraft-Diffusers")
# Default prompts for demo
DEFAULT_PROMPTS = {
"village": "A charming medieval village with cobblestone streets, thatched-roof houses, and vibrant flower gardens under a bright blue sky.",
"temple": "A majestic ancient temple stands under a clear blue sky, its grandeur highlighted by towering Doric columns and intricate architectural details.",
"forest": "A lush green forest with tall trees, dappled sunlight filtering through the leaves, and a winding dirt path.",
"beach": "A tropical beach with crystal clear turquoise water, white sand, and palm trees swaying in the breeze.",
}
# I2V: default reference image (URL). Can override with a local path.
DEFAULT_I2V_IMAGE_URL = (
"https://huggingface.co/datasets/huggingface/documentation-images/"
"resolve/main/diffusers/astronaut.jpg"
)
DEFAULT_I2V_PROMPT = (
"An astronaut hatching from an egg, on the surface of the moon, "
"the darkness and depth of space realised in the background."
)
OUTPUT_PATH = "video_samples_gamecraft"
def main():
    """Generate one HunyuanGameCraft video driven by a "forward" camera action.

    The GAMECRAFT_I2V_IMAGE environment variable selects the mode: unset means
    I2V with the default reference image, an empty/blank value means T2V, and
    any other value is used as the I2V image URL or path.
    """
    # FastVideo will automatically download weights from HuggingFace.
    generator = VideoGenerator.from_pretrained(
        MODEL_PATH,
        num_gpus=1,
        use_fsdp_inference=True,
        dit_cpu_offload=True,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,
    )

    # Video and camera-action parameters.
    height, width, num_frames = 704, 1280, 33
    action, action_speed = "forward", 0.2

    # Camera trajectory (Plücker coordinates).
    camera_states = create_camera_trajectory(
        action=action,
        height=height,
        width=width,
        num_frames=num_frames,
        action_speed=action_speed,
        dtype=torch.bfloat16,
    )
    print(f"Camera states shape: {camera_states.shape}")

    # Resolve the reference image: None means T2V, anything else means I2V.
    env_image = os.environ.get("GAMECRAFT_I2V_IMAGE")
    if env_image is None:
        image_path = DEFAULT_I2V_IMAGE_URL
    else:
        image_path = env_image.strip() or None

    is_i2v = image_path is not None
    if is_i2v:
        prompt = DEFAULT_I2V_PROMPT
        mode = "I2V"
    else:
        prompt = DEFAULT_PROMPTS["temple"]
        mode = "T2V"
    print(f"Mode: {mode}, prompt: {prompt[:60]}...")

    gen_kw = {
        "prompt": prompt,
        "negative_prompt": "",
        "camera_states": camera_states,
        "height": height,
        "width": width,
        "num_frames": num_frames,
        "num_inference_steps": 50,
        "guidance_scale": 6.0,
        "seed": 42,
        "fps": 24,
        "output_path": OUTPUT_PATH,
        "save_video": True,
    }
    # image_path is only passed in I2V mode.
    if is_i2v:
        gen_kw["image_path"] = image_path
    generator.generate_video(**gen_kw)


if __name__ == "__main__":
    main()
basic_hy15.py
from fastvideo import VideoGenerator
import json
# from fastvideo.configs.sample import SamplingParam
OUTPUT_PATH = "video_samples_hy15"
def main():
    """Load HunyuanVideo-1.5 480p T2V once and render two prompts into ``OUTPUT_PATH``."""
    # FastVideo automatically uses the optimal default arguments for the model;
    # for a local path it makes a best-effort attempt to identify them.
    load_options = dict(
        # FastVideo handles the distributed setup automatically.
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        # Set to False if low on CPU RAM or hitting an obscure
        # "CUDA error: Invalid argument".
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )
    generator = VideoGenerator.from_pretrained(
        "hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v",
        **load_options,
    )

    raccoon_prompt = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
    )
    lion_prompt = (
        "A majestic lion strides across the golden savanna, its powerful frame "
        "glistening under the warm afternoon sun. The tall grass ripples gently in "
        "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
        "embodying the raw energy of the wild. Low angle, steady tracking shot, "
        "cinematic."
    )

    # Both prompts share the same generation settings.
    for text in (raccoon_prompt, lion_prompt):
        generator.generate_video(
            text,
            output_path=OUTPUT_PATH,
            save_video=True,
            negative_prompt="",
            num_frames=81,
            fps=16,
        )


if __name__ == "__main__":
    main()
basic_hy15_1080p.py
from fastvideo import VideoGenerator
import json
# from fastvideo.configs.sample import SamplingParam
OUTPUT_PATH = "video_samples_hy15_1080p"
def main():
    """Load the HunyuanVideo-1.5 1080p pipeline once and render two prompts into ``OUTPUT_PATH``."""
    # Model choices noted by the original script:
    #   "weizhou03/HunyuanVideo-1.5-Diffusers-1080p-2SR"  # 480p -> 720p -> 1080p
    #   "weizhou03/HunyuanVideo-1.5-Diffusers-1080p"      # 720p -> 1080p
    model_id = "weizhou03/HunyuanVideo-1.5-Diffusers-1080p-2SR"

    # FastVideo automatically uses the optimal default arguments for the model;
    # for a local path it makes a best-effort attempt to identify them.
    load_options = dict(
        # FastVideo handles the distributed setup automatically.
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        # Set to False if low on CPU RAM or hitting an obscure
        # "CUDA error: Invalid argument".
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )
    generator = VideoGenerator.from_pretrained(model_id, **load_options)

    raccoon_prompt = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
    )
    lion_prompt = (
        "A majestic lion strides across the golden savanna, its powerful frame "
        "glistening under the warm afternoon sun. The tall grass ripples gently in "
        "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
        "embodying the raw energy of the wild. Low angle, steady tracking shot, "
        "cinematic."
    )

    # Both prompts share the same generation settings.
    for text in (raccoon_prompt, lion_prompt):
        generator.generate_video(
            text,
            output_path=OUTPUT_PATH,
            save_video=True,
            negative_prompt="",
        )


if __name__ == "__main__":
    main()
basic_hyworld.py
from fastvideo import VideoGenerator
from fastvideo.models.dits.hyworld.resolution_utils import get_resolution_from_image
# Default prompt from HY-WorldPlay run.sh
DEFAULT_PROMPT = 'A paved pathway leads towards a stone arch bridge spanning a calm body of water. Lush green trees and foliage line the path and the far bank of the water. A traditional-style pavilion with a tiered, reddish-brown roof sits on the far shore. The water reflects the surrounding greenery and the sky. The scene is bathed in soft, natural light, creating a tranquil and serene atmosphere. The pathway is composed of large, rectangular stones, and the bridge is constructed of light gray stone. The overall composition emphasizes the peaceful and harmonious nature of the landscape.'
DEFAULT_IMAGE = 'https://raw.githubusercontent.com/Tencent-Hunyuan/HY-WorldPlay/main/assets/img/test.png'
OUTPUT_PATH = "video_samples_hyworld"
def main():
    """Parse CLI options and generate one HYWorld video from an image, prompt and pose string."""
    import argparse

    # Notes kept from the original script (meaning not shown here; confirm
    # against the HY-WorldPlay documentation):
    #   pose: (a, w, s, d) - (15, 31)
    #   num_frames: (61, 125)
    parser = argparse.ArgumentParser(description="HYWorld video generation with FastVideo")
    cli_options = (
        ("--prompt", str, DEFAULT_PROMPT, "Text prompt for video generation"),
        ("--image", str, DEFAULT_IMAGE, "Path or URL to input image"),
        ("--pose", str, 'w-31', "Pose string (e.g., 'a-31', 'w-31', 's-31', 'd-31')"),
        ("--output_path", str, OUTPUT_PATH, "Output video path"),
        ("--num-frames", int, 125, "Number of frames"),
        ("--seed", int, 1, "Random seed"),
        ("--resolution", str, "480p", "Only support 480p for now"),
    )
    for flag, value_type, default_value, help_text in cli_options:
        parser.add_argument(flag, type=value_type, default=default_value, help=help_text)
    args = parser.parse_args()

    # The output resolution is derived from the input image and the bucket name.
    height, width = get_resolution_from_image(args.image, args.resolution)

    print(f"Image: {args.image}")
    print(f"Pose: {args.pose}")
    print(f"Resolution: {height}x{width} (from {args.resolution} buckets)")
    print(f"Num frames: {args.num_frames}")
    print(f"Output path: {args.output_path}")

    print("\nInitializing VideoGenerator for HYWorld...")
    offload_everything = dict(
        dit_cpu_offload=True,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,
        image_encoder_cpu_offload=True,
    )
    generator = VideoGenerator.from_pretrained(
        "FastVideo/HY-WorldPlay-Bidirectional-Diffusers",
        num_gpus=1,
        use_fsdp_inference=True,
        **offload_everything,
    )

    # The pose string is converted to camera matrices by the pipeline.
    print("\nGenerating video...")
    generator.generate_video(
        prompt=args.prompt,
        image_path=args.image,
        pose=args.pose,  # camera trajectory control
        output_path=args.output_path,
        save_video=True,
        negative_prompt="",
        num_frames=args.num_frames,
        fps=24,
        height=height,
        width=width,
        seed=args.seed,
    )
    print(f"\nVideo saved to: {args.output_path}")


if __name__ == "__main__":
    main()
basic_lingbotworld_base_cam.py
from fastvideo import VideoGenerator
from fastvideo.models.dits.lingbotworld.cam_utils import prepare_camera_embedding
# from fastvideo.configs.sample import SamplingParam
OUTPUT_PATH = "video_samples_lingbotworld"
def main():
    """Generate one LingBot-World video conditioned on an image and a camera trajectory."""
    # FastVideo automatically uses the optimal default arguments for the model;
    # for a local path it makes a best-effort attempt to identify them.
    load_options = dict(
        # FastVideo handles the distributed setup automatically.
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,  # DiT needs to be offloaded for MoE
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        # Set to False if CPU RAM is limited and there are no frequent
        # CPU-GPU transfers.
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )
    generator = VideoGenerator.from_pretrained(
        "FastVideo/LingBot-World-Base-Cam-Diffusers",
        **load_options,
    )

    height, width = 480, 832
    requested_frames = 81
    prompt = "The video presents a soaring journey through a fantasy jungle. The wind whips past the rider's blue hands gripping the reins, causing the leather straps to vibrate. The ancient gothic castle approaches steadily, its stone details becoming clearer against the backdrop of floating islands and distant waterfalls."
    image_path = "https://raw.githubusercontent.com/Robbyant/lingbot-world/main/examples/00/image.jpg"
    action_path = "examples/inference/basic/lingbotworld_examples/00"

    # The helper returns the camera embedding together with a frame count,
    # and that returned count is what gets passed to generation.
    c2ws_plucker_emb, num_frames = prepare_camera_embedding(
        action_path=action_path,
        num_frames=requested_frames,
        height=height,
        width=width,
        spatial_scale=8,
    )

    generator.generate_video(
        prompt,
        image_path=image_path,
        output_path=OUTPUT_PATH,
        save_video=True,
        num_frames=num_frames,
        height=height,
        width=width,
        c2ws_plucker_emb=c2ws_plucker_emb,
    )


if __name__ == "__main__":
    main()
basic_longcat_i2v.py
"""
LongCat Image-to-Video (I2V) Example Script
This script demonstrates LongCat I2V inference using the FastVideo Python API.
LongCat I2V takes an input image and generates a video from it.
It runs both basic generation (50 steps) and distill+refine generation
(16 steps distill + 50 steps refinement to 720p with BSA).
Usage:
python examples/inference/basic/basic_longcat_i2v.py
Note:
Refinement uses 768x768 dimensions where latent (48x48) is divisible by 8,
compatible with BSA chunks [4, 4, 8].
"""
import glob
import os
from fastvideo import VideoGenerator
# Common prompts and settings matching the shell script examples
PROMPT = (
"A woman sits at a wooden table by the window in a cozy café. She reaches out "
"with her right hand, picks up the white coffee cup from the saucer, and gently "
"brings it to her lips to take a sip. After drinking, she places the cup back on "
"the table and looks out the window, enjoying the peaceful atmosphere."
)
NEGATIVE_PROMPT = (
"Bright tones, overexposed, static, blurred details, subtitles, style, works, "
"paintings, images, static, overall gray, worst quality, low quality, JPEG compression "
"residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, "
"deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, "
"three legs, many people in the background, walking backwards"
)
# Input image path
IMAGE_PATH = "assets/girl.png"
SEED = 42
def basic_generation():
    """
    Run the plain LongCat I2V pipeline: 50 denoising steps at 480x480.

    Loads the I2V model without BSA, renders ``PROMPT`` conditioned on
    ``IMAGE_PATH``, saves the result and shuts the generator down.
    """
    rule = "=" * 60
    print(rule)
    print("LongCat I2V: Basic Generation (50 steps, 480p)")
    print(rule)

    load_options = dict(
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=False,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=False,
        enable_bsa=False,
    )
    generator = VideoGenerator.from_pretrained("FastVideo/LongCat-Video-I2V-Diffusers", **load_options)

    output_path = "outputs_video/longcat_i2v_basic"
    sampling_options = dict(
        height=480,
        width=480,  # square output
        num_frames=93,
        num_inference_steps=50,
        fps=15,
        guidance_scale=4.0,
        seed=SEED,
    )
    generator.generate_video(
        prompt=PROMPT,
        negative_prompt=NEGATIVE_PROMPT,
        image_path=IMAGE_PATH,
        output_path=output_path,
        save_video=True,
        **sampling_options,
    )

    print(f"\nBasic generation complete! Video saved to: {output_path}")
    generator.shutdown()
def distill_refine_generation():
    """
    Run LongCat I2V with the distill+refine pipeline (16 steps + refinement to 768x768).

    Stage 1 loads the I2V model with the distilled LoRA and renders a 480x480
    clip in 16 steps. Stage 2 loads the T2V model with the refinement LoRA and
    BSA enabled, and refines the most recently modified stage-1 ``.mp4`` at
    768x768.

    Raises:
        FileNotFoundError: if stage 1 left no ``.mp4`` file in its output directory.
    """
    print("\n" + "=" * 60)
    print("LongCat I2V: Distill + Refine Pipeline")
    print("=" * 60)

    # Stage 1: Distilled generation (16 steps at 480p)
    print("\n[Stage 1] Distilled generation (16 steps, 480p)")
    print("-" * 40)

    generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-I2V-Diffusers",
        num_gpus=1,
        use_fsdp_inference=True,
        dit_cpu_offload=False,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=False,
        enable_bsa=False,
        lora_path="FastVideo/LongCat-Video-T2V-Distilled-LoRA",
        lora_nickname="distilled",
    )

    distill_output_path = "outputs_video/longcat_i2v_distill"
    generator.generate_video(
        prompt=PROMPT,
        negative_prompt=NEGATIVE_PROMPT,
        image_path=IMAGE_PATH,
        output_path=distill_output_path,
        save_video=True,
        height=480,
        width=480,  # Square
        num_frames=93,
        num_inference_steps=16,
        fps=15,
        guidance_scale=1.0,
        seed=SEED,
    )

    print(f"Distilled generation complete! Video saved to: {distill_output_path}")
    generator.shutdown()

    # Stage 2: Refinement (480p -> 768p)
    print("\n[Stage 2] Refinement (480p -> 768p with BSA)")
    print("-" * 40)

    # Find the actual saved video file from stage 1.
    video_files = glob.glob(os.path.join(distill_output_path, "*.mp4"))
    if not video_files:
        raise FileNotFoundError(f"No video file found in {distill_output_path}")

    # Use the most recently modified video file.
    distill_video_path = max(video_files, key=os.path.getmtime)
    print(f"Using stage 1 video: {distill_video_path}")

    # Create a new generator with the refinement LoRA and BSA enabled.
    # Refinement uses the T2V model (not I2V) since it is upscaling the
    # already-generated video.
    # For BSA chunks [4, 4, 8] the spatial latent must be divisible by 8:
    #   768x768 -> latent 48x48, 48 % 8 == 0.
    # The previous 720x720 / [4, 4, 4] settings contradicted this constraint
    # (by the same arithmetic 720 gives a 45-wide latent).
    refine_generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-T2V-Diffusers",
        num_gpus=1,
        use_fsdp_inference=True,
        dit_cpu_offload=True,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=False,
        enable_bsa=True,
        bsa_sparsity=0.875,
        bsa_chunk_q=[4, 4, 8],
        bsa_chunk_k=[4, 4, 8],
        lora_path="FastVideo/LongCat-Video-T2V-Refinement-LoRA",
        lora_nickname="refinement",
    )

    # The directory name still says "720p"; it is kept unchanged so existing
    # output locations do not move.
    refine_output_path = "outputs_video/longcat_i2v_refine_720p"
    refine_generator.generate_video(
        prompt=PROMPT,
        negative_prompt=NEGATIVE_PROMPT,
        output_path=refine_output_path,
        save_video=True,
        refine_from=distill_video_path,
        t_thresh=0.5,
        spatial_refine_only=False,
        num_cond_frames=0,
        height=768,
        width=768,
        num_inference_steps=50,
        fps=30,
        guidance_scale=1.0,
        seed=SEED,
    )

    print(f"Refinement complete! Video saved to: {refine_output_path}")
    refine_generator.shutdown()
def main():
    """Run the basic pipeline followed by the distill+refine pipeline, with banners."""
    rule = "=" * 60

    print("\n" + rule)
    print("LongCat Image-to-Video Example")
    print(rule + "\n")

    # Basic generation first, then distill+refine.
    for run_pipeline in (basic_generation, distill_refine_generation):
        run_pipeline()

    print("\n" + rule)
    print("All generations complete!")
    print(rule)


if __name__ == "__main__":
    main()
basic_longcat_t2v.py
"""
LongCat Text-to-Video (T2V) Example Script
This script demonstrates LongCat T2V inference using the FastVideo Python API.
It runs both basic generation (50 steps) and distill+refine generation
(16 steps distill + 50 steps refinement to 720p).
Usage:
python examples/inference/basic/basic_longcat_t2v.py
"""
import glob
import os
from fastvideo import VideoGenerator
# Common prompts and settings matching the shell script examples
PROMPT = (
"In a realistic photography style, a white boy around seven or eight years old "
"sits on a park bench, wearing a light blue T-shirt, denim shorts, and white sneakers. "
"He holds an ice cream cone with vanilla and chocolate flavors, and beside him is a "
"medium-sized golden Labrador. Smiling, the boy offers the ice cream to the dog, "
"who eagerly licks it with its tongue. The sun is shining brightly, and the background "
"features a green lawn and several tall trees, creating a warm and loving scene."
)
NEGATIVE_PROMPT = (
"Bright tones, overexposed, static, blurred details, subtitles, style, works, "
"paintings, images, static, overall gray, worst quality, low quality, JPEG compression "
"residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, "
"deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, "
"three legs, many people in the background, walking backwards"
)
SEED = 42
def basic_generation():
    """
    Run the plain LongCat T2V pipeline: 50 denoising steps at 480x832.

    Loads the T2V model without BSA, renders ``PROMPT``, saves the result and
    shuts the generator down.
    """
    rule = "=" * 60
    print(rule)
    print("LongCat T2V: Basic Generation (50 steps, 480p)")
    print(rule)

    load_options = dict(
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=False,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=False,
        enable_bsa=False,
    )
    generator = VideoGenerator.from_pretrained("FastVideo/LongCat-Video-T2V-Diffusers", **load_options)

    output_path = "outputs_video/longcat_t2v_basic"
    sampling_options = dict(
        height=480,
        width=832,
        num_frames=93,
        num_inference_steps=50,
        fps=15,
        guidance_scale=4.0,
        seed=SEED,
    )
    generator.generate_video(
        prompt=PROMPT,
        negative_prompt=NEGATIVE_PROMPT,
        output_path=output_path,
        save_video=True,
        **sampling_options,
    )

    print(f"\nBasic generation complete! Video saved to: {output_path}")
    generator.shutdown()
def distill_refine_generation():
    """
    Run LongCat T2V with distill+refine pipeline (16 steps + refinement to 720p).

    Stage 1 loads the distilled LoRA and generates a 480p clip in 16 steps.
    Stage 2 loads a fresh generator with the refinement LoRA and BSA enabled
    and refines the stage 1 clip to 720p.

    Each generator is shut down in a ``finally`` clause so it is released
    even when its stage raises.

    Raises:
        FileNotFoundError: if stage 1 left no ``*.mp4`` file in its output
            directory.
    """
    print("\n" + "=" * 60)
    print("LongCat T2V: Distill + Refine Pipeline")
    print("=" * 60)

    # Stage 1: Distilled generation (16 steps at 480p)
    print("\n[Stage 1] Distilled generation (16 steps, 480p)")
    print("-" * 40)

    generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-T2V-Diffusers",
        num_gpus=1,
        use_fsdp_inference=True,
        dit_cpu_offload=False,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=False,
        enable_bsa=False,
        lora_path="FastVideo/LongCat-Video-T2V-Distilled-LoRA",
        lora_nickname="distilled",
    )

    distill_output_path = "outputs_video/longcat_t2v_distill"
    try:
        generator.generate_video(
            prompt=PROMPT,
            negative_prompt=NEGATIVE_PROMPT,
            output_path=distill_output_path,
            save_video=True,
            height=480,
            width=832,
            num_frames=93,
            num_inference_steps=16,
            fps=15,
            guidance_scale=1.0,
            seed=SEED,
        )
        print(f"Distilled generation complete! Video saved to: {distill_output_path}")
    finally:
        # Release the stage 1 generator before stage 2 loads its own.
        generator.shutdown()

    # Stage 2: Refinement (480p -> 720p)
    print("\n[Stage 2] Refinement (480p -> 720p with BSA)")
    print("-" * 40)

    # Find the actual saved video file from stage 1
    video_files = glob.glob(os.path.join(distill_output_path, "*.mp4"))
    if not video_files:
        raise FileNotFoundError(f"No video file found in {distill_output_path}")

    # Use the most recently modified video file, so leftovers from earlier
    # runs in the same directory are not picked up.
    distill_video_path = max(video_files, key=os.path.getmtime)
    print(f"Using stage 1 video: {distill_video_path}")

    # Create a new generator with refinement LoRA and BSA enabled
    refine_generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-T2V-Diffusers",
        num_gpus=1,
        use_fsdp_inference=True,
        dit_cpu_offload=True,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=False,
        enable_bsa=True,
        bsa_sparsity=0.875,
        bsa_chunk_q=[4, 4, 8],
        bsa_chunk_k=[4, 4, 8],
        lora_path="FastVideo/LongCat-Video-T2V-Refinement-LoRA",
        lora_nickname="refinement",
    )

    refine_output_path = "outputs_video/longcat_t2v_refine_720p"
    try:
        refine_generator.generate_video(
            prompt=PROMPT,
            negative_prompt=NEGATIVE_PROMPT,
            output_path=refine_output_path,
            save_video=True,
            refine_from=distill_video_path,
            t_thresh=0.5,
            spatial_refine_only=False,
            num_cond_frames=0,
            height=720,
            width=1280,
            num_inference_steps=50,
            fps=30,
            guidance_scale=1.0,
            seed=SEED,
        )
        print(f"Refinement complete! Video saved to: {refine_output_path}")
    finally:
        refine_generator.shutdown()
def main():
    """Run the basic pipeline first, then the distill+refine pipeline."""
    rule = "=" * 60

    print("\n" + rule)
    print("LongCat Text-to-Video Example")
    print(rule + "\n")

    # Full 50-step generation.
    basic_generation()

    # Distilled generation followed by refinement.
    distill_refine_generation()

    print("\n" + rule)
    print("All generations complete!")
    print(rule)


if __name__ == "__main__":
    main()
basic_longcat_vc.py
"""
LongCat Video Continuation (VC) Example Script
This script demonstrates LongCat VC inference using the FastVideo Python API.
LongCat VC takes an input video and generates a continuation of it.
It runs both basic generation (50 steps) and distill+refine generation
(16 steps distill + 50 steps refinement to 720p).
Usage:
python examples/inference/basic/basic_longcat_vc.py
Prerequisites:
- Ensure the input video exists at assets/motorcycle.mp4
(or provide your own video)
"""
import glob
import os
from fastvideo import VideoGenerator
# Shared generation inputs, mirroring the shell script examples.

# Positive prompt describing how the input video should continue.
PROMPT = (
    "A person rides a motorcycle along a long, straight road that stretches between "
    "a body of water and a forested hillside. The rider steadily accelerates, keeping "
    "the motorcycle centered between the guardrails, while the scenery passes by on "
    "both sides. The video captures the journey from the rider's perspective, emphasizing "
    "the sense of motion and adventure."
)

# Negative prompt passed alongside PROMPT to every generate_video call.
NEGATIVE_PROMPT = (
    "Bright tones, overexposed, static, blurred details, subtitles, style, works, "
    "paintings, images, static, overall gray, worst quality, low quality, JPEG compression "
    "residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, "
    "deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, "
    "three legs, many people in the background, walking backwards"
)

# Video to continue from.
VIDEO_PATH = "assets/motorcycle.mp4"

# How many frames of the input video are used as conditioning.
NUM_COND_FRAMES = 13

# Seed passed to every generate_video call in this script.
SEED = 42
def basic_generation():
    """
    Run basic LongCat VC generation (50 steps at 480p).

    This uses the full 50-step denoising process for highest quality.
    The input video at VIDEO_PATH supplies NUM_COND_FRAMES conditioning
    frames. The result is written under ``outputs_video/longcat_vc_basic``.

    The generator is shut down in a ``finally`` clause so it is released
    even when generation raises.

    Raises:
        FileNotFoundError: if VIDEO_PATH does not exist.
    """
    print("=" * 60)
    print("LongCat VC: Basic Generation (50 steps, 480p)")
    print("=" * 60)

    # Fail before loading the model if the input video is missing.
    if not os.path.exists(VIDEO_PATH):
        raise FileNotFoundError(
            f"Video not found at {VIDEO_PATH}. "
            "Please provide a valid video path."
        )

    generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-VC-Diffusers",
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=False,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=False,
        enable_bsa=False,
    )

    output_path = "outputs_video/longcat_vc_basic"
    try:
        generator.generate_video(
            prompt=PROMPT,
            negative_prompt=NEGATIVE_PROMPT,
            video_path=VIDEO_PATH,
            num_cond_frames=NUM_COND_FRAMES,
            output_path=output_path,
            save_video=True,
            height=480,
            width=832,
            num_frames=93,
            num_inference_steps=50,
            fps=15,
            guidance_scale=4.0,
            seed=SEED,
        )
        print(f"\nBasic generation complete! Video saved to: {output_path}")
    finally:
        # Always release the generator, including on failure.
        generator.shutdown()
def distill_refine_generation():
    """
    Run LongCat VC with distill+refine pipeline (16 steps + refinement to 720p).

    Stage 1 loads the distilled LoRA on the VC model and generates a 480p
    continuation in 16 steps. Stage 2 loads the T2V model with the
    refinement LoRA and BSA enabled and refines the stage 1 clip to 720p.

    Each generator is shut down in a ``finally`` clause so it is released
    even when its stage raises.

    Raises:
        FileNotFoundError: if VIDEO_PATH does not exist, or if stage 1 left
            no ``*.mp4`` file in its output directory.
    """
    print("\n" + "=" * 60)
    print("LongCat VC: Distill + Refine Pipeline")
    print("=" * 60)

    # Fail before loading the model if the input video is missing.
    if not os.path.exists(VIDEO_PATH):
        raise FileNotFoundError(
            f"Video not found at {VIDEO_PATH}. "
            "Please provide a valid video path."
        )

    # Stage 1: Distilled generation (16 steps at 480p)
    print("\n[Stage 1] Distilled generation (16 steps, 480p)")
    print("-" * 40)

    generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-VC-Diffusers",
        num_gpus=1,
        use_fsdp_inference=True,
        dit_cpu_offload=False,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=False,
        enable_bsa=False,
        lora_path="FastVideo/LongCat-Video-T2V-Distilled-LoRA",
        lora_nickname="distilled",
    )

    distill_output_path = "outputs_video/longcat_vc_distill"
    try:
        generator.generate_video(
            prompt=PROMPT,
            negative_prompt=NEGATIVE_PROMPT,
            video_path=VIDEO_PATH,
            num_cond_frames=NUM_COND_FRAMES,
            output_path=distill_output_path,
            save_video=True,
            height=480,
            width=832,
            num_frames=93,
            num_inference_steps=16,
            fps=15,
            guidance_scale=1.0,
            seed=SEED,
        )
        print(f"Distilled generation complete! Video saved to: {distill_output_path}")
    finally:
        # Release the stage 1 generator before stage 2 loads its own.
        generator.shutdown()

    # Stage 2: Refinement (480p -> 720p)
    print("\n[Stage 2] Refinement (480p -> 720p with BSA)")
    print("-" * 40)

    # Find the actual saved video file from stage 1
    video_files = glob.glob(os.path.join(distill_output_path, "*.mp4"))
    if not video_files:
        raise FileNotFoundError(f"No video file found in {distill_output_path}")

    # Use the most recently modified video file, so leftovers from earlier
    # runs in the same directory are not picked up.
    distill_video_path = max(video_files, key=os.path.getmtime)
    print(f"Using stage 1 video: {distill_video_path}")

    # Create a new generator with refinement LoRA and BSA enabled
    # Note: Refinement uses the T2V model (not VC) since it's upscaling the generated video
    refine_generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-T2V-Diffusers",
        num_gpus=1,
        use_fsdp_inference=True,
        dit_cpu_offload=True,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=False,
        enable_bsa=True,
        bsa_sparsity=0.875,
        bsa_chunk_q=[4, 4, 8],
        bsa_chunk_k=[4, 4, 8],
        lora_path="FastVideo/LongCat-Video-T2V-Refinement-LoRA",
        lora_nickname="refinement",
    )

    refine_output_path = "outputs_video/longcat_vc_refine_720p"
    try:
        refine_generator.generate_video(
            prompt=PROMPT,
            negative_prompt=NEGATIVE_PROMPT,
            output_path=refine_output_path,
            save_video=True,
            refine_from=distill_video_path,
            t_thresh=0.5,
            spatial_refine_only=False,
            num_cond_frames=0,  # For refinement, no conditioning frames
            height=720,
            width=1280,
            num_inference_steps=50,
            fps=30,
            guidance_scale=1.0,
            seed=SEED,
        )
        print(f"Refinement complete! Video saved to: {refine_output_path}")
    finally:
        refine_generator.shutdown()
def main():
    """Run the basic pipeline first, then the distill+refine pipeline."""
    rule = "=" * 60

    print("\n" + rule)
    print("LongCat Video Continuation Example")
    print(rule + "\n")

    # Full 50-step generation.
    basic_generation()

    # Distilled generation followed by refinement.
    distill_refine_generation()

    print("\n" + rule)
    print("All generations complete!")
    print(rule)


if __name__ == "__main__":
    main()
basic_ltx2.py
from fastvideo import VideoGenerator
# Text prompt for the generated clip.
PROMPT = (
    "A warm sunny backyard. The camera starts in a tight cinematic close-up "
    "of a woman and a man in their 30s, facing each other with serious "
    "expressions. The woman, emotional and dramatic, says softly, \"That's "
    "it... Dad's lost it. And we've lost Dad.\" The man exhales, slightly "
    "annoyed: \"Stop being so dramatic, Jess.\" A beat. He glances aside, "
    "then mutters defensively, \"He's just having fun.\" The camera slowly "
    "pans right, revealing the grandfather in the garden wearing enormous "
    "butterfly wings, waving his arms in the air like he's trying to take "
    "off. He shouts, \"Wheeeew!\" as he flaps his wings with full commitment. "
    "The woman covers her face, on the verge of tears. The tone is deadpan, "
    "absurd, and quietly tragic."
)


def main() -> None:
    """Generate one 121-frame 1920x1088 clip for PROMPT with the LTX2 base model.

    The generator is shut down in a ``finally`` clause so it is released
    even when generation raises.
    """
    # Uses FastVideo default sampling settings for LTX2 base.
    generator = VideoGenerator.from_pretrained(
        "Davids048/LTX2-Base-Diffusers",
        num_gpus=1,
    )

    output_path = "outputs_video/ltx2_basic/output_ltx2_base_t2v_1088_1920_1.1.mp4"
    try:
        generator.generate_video(
            prompt=PROMPT,
            output_path=output_path,
            save_video=True,
            num_frames=121,
            height=1088,
            width=1920,
        )
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()
basic_ltx2_distilled.py
import os

# Select the attention backend before fastvideo is imported, so the setting
# is already in the environment whenever fastvideo reads it.
os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "FLASH_ATTN"

from fastvideo import VideoGenerator

# Text prompt for the generated clip.
PROMPT = (
    "A warm sunny backyard. The camera starts in a tight cinematic close-up "
    "of a woman and a man in their 30s, facing each other with serious "
    "expressions. The woman, emotional and dramatic, says softly, \"That's "
    "it... Dad's lost it. And we've lost Dad.\" The man exhales, slightly "
    "annoyed: \"Stop being so dramatic, Jess.\" A beat. He glances aside, "
    "then mutters defensively, \"He's just having fun.\" The camera slowly "
    "pans right, revealing the grandfather in the garden wearing enormous "
    "butterfly wings, waving his arms in the air like he's trying to take "
    "off. He shouts, \"Wheeeew!\" as he flaps his wings with full commitment. "
    "The woman covers her face, on the verge of tears. The tone is deadpan, "
    "absurd, and quietly tragic."
)


def main() -> None:
    """Generate one clip for PROMPT with the distilled LTX2 model on 4 GPUs.

    No sampling overrides are passed to ``generate_video``. The generator is
    shut down in a ``finally`` clause so it is released even when generation
    raises.
    """
    generator = VideoGenerator.from_pretrained(
        "FastVideo/LTX2-Distilled-Diffusers",
        num_gpus=4,
    )

    output_path = "outputs_video/ltx2_basic/output_ltx2_distilled_t2v.mp4"
    try:
        generator.generate_video(
            prompt=PROMPT,
            output_path=output_path,
            save_video=True,
        )
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()
basic_matrixgame.py
from fastvideo import VideoGenerator
from fastvideo.models.dits.matrixgame.utils import create_action_presets
import torch
# Selectable variants and their keyboard_dim:
#   base_distilled_model       keyboard_dim=4
#   gta_distilled_model        keyboard_dim=2
#   templerun_distilled_model  keyboard_dim=7 (keyboard only, no mouse)
MODEL_VARIANT = "base_distilled_model"

# Per-variant settings: model to load, keyboard action width, and the
# first-frame image used as conditioning.
VARIANT_CONFIG = {
    "base_distilled_model": {
        "model_path": "FastVideo/Matrix-Game-2.0-Base-Diffusers",
        "keyboard_dim": 4,
        "image_url": "https://raw.githubusercontent.com/SkyworkAI/Matrix-Game/main/Matrix-Game-2/demo_images/universal/0000.png",
    },
    "gta_distilled_model": {
        "model_path": "FastVideo/Matrix-Game-2.0-GTA-Diffusers",
        "keyboard_dim": 2,
        "image_url": "https://raw.githubusercontent.com/SkyworkAI/Matrix-Game/main/Matrix-Game-2/demo_images/gta_drive/0000.png",
    },
    "templerun_distilled_model": {
        "model_path": "FastVideo/Matrix-Game-2.0-TempleRun-Diffusers",
        "keyboard_dim": 7,
        "image_url": "https://raw.githubusercontent.com/SkyworkAI/Matrix-Game/main/Matrix-Game-2/demo_images/temple_run/0000.png",
    },
}

# Output location passed to generate_video.
OUTPUT_PATH = "video_samples_matrixgame2"
def main():
    """Generate one Matrix-Game clip for the variant selected by MODEL_VARIANT."""
    variant = VARIANT_CONFIG[MODEL_VARIANT]

    # FastVideo picks default arguments for the model; for a local path it
    # makes a best-effort attempt to identify them.
    generator = VideoGenerator.from_pretrained(
        variant["model_path"],
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,  # DiT need to be offloaded for MoE
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        # Set pin_cpu_memory to false if CPU RAM is limited and there're no frequent CPU-GPU transfer
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )

    num_frames = 597
    presets = create_action_presets(num_frames, keyboard_dim=variant["keyboard_dim"])
    grid_sizes = torch.tensor([150, 44, 80])

    # unsqueeze(0) adds a leading dimension to each action tensor.
    mouse_cond = presets["mouse"].unsqueeze(0)
    keyboard_cond = presets["keyboard"].unsqueeze(0)

    generator.generate_video(
        prompt="",
        image_path=variant["image_url"],
        mouse_cond=mouse_cond,
        keyboard_cond=keyboard_cond,
        grid_sizes=grid_sizes,
        num_frames=num_frames,
        height=352,
        width=640,
        num_inference_steps=50,
        output_path=OUTPUT_PATH,
        save_video=True,
    )


if __name__ == "__main__":
    main()
basic_matrixgame_streaming.py
from fastvideo.entrypoints.streaming_generator import StreamingVideoGenerator
from fastvideo.models.dits.matrixgame.utils import get_current_action_async, expand_action_to_frames
import torch
import asyncio
# Selectable variants and their keyboard_dim:
#   base_distilled_model       keyboard_dim=4
#   gta_distilled_model        keyboard_dim=2
#   templerun_distilled_model  keyboard_dim=7 (keyboard only, no mouse)
MODEL_VARIANT = "base_distilled_model"

# Per-variant settings: model to load, keyboard action width, the mode
# string handed to the action-input helper, and the first-frame image.
VARIANT_CONFIG = {
    "base_distilled_model": {
        "model_path": "FastVideo/Matrix-Game-2.0-Base-Diffusers",
        "keyboard_dim": 4,
        "mode": "universal",
        "image_url": "https://raw.githubusercontent.com/SkyworkAI/Matrix-Game/main/Matrix-Game-2/demo_images/universal/0000.png",
    },
    "gta_distilled_model": {
        "model_path": "FastVideo/Matrix-Game-2.0-GTA-Diffusers",
        "keyboard_dim": 2,
        "mode": "gta_drive",
        "image_url": "https://raw.githubusercontent.com/SkyworkAI/Matrix-Game/main/Matrix-Game-2/demo_images/gta_drive/0000.png",
    },
    "templerun_distilled_model": {
        "model_path": "FastVideo/Matrix-Game-2.0-TempleRun-Diffusers",
        "keyboard_dim": 7,
        "mode": "templerun",
        "image_url": "https://raw.githubusercontent.com/SkyworkAI/Matrix-Game/main/Matrix-Game-2/demo_images/temple_run/0000.png",
    },
}

# Output location passed to the streaming generator.
OUTPUT_PATH = "video_samples_matrixgame2"
async def main():
    """Interactively stream Matrix-Game video blocks for the selected variant.

    Initializes the streaming generator with all-zero action tensors, then
    runs up to ``max_blocks`` rounds: read an action, expand it to frames,
    generate the next block, and ask whether to continue. The final video is
    saved once the loop ends normally.

    The generator is shut down in a ``finally`` clause so it is released
    even when initialization or a step raises.
    """
    # FastVideo will automatically use the optimal default arguments for the
    # model.
    # If a local path is provided, FastVideo will make a best effort
    # attempt to identify the optimal arguments.
    config = VARIANT_CONFIG[MODEL_VARIANT]
    generator = StreamingVideoGenerator.from_pretrained(
        config["model_path"],
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,  # DiT need to be offloaded for MoE
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        # Set pin_cpu_memory to false if CPU RAM is limited and there're no frequent CPU-GPU transfer
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )

    try:
        max_blocks = 50
        num_frames = 597
        # Zero-filled placeholder actions used only for initialization.
        actions = {
            "keyboard": torch.zeros((num_frames, config["keyboard_dim"])),
            "mouse": torch.zeros((num_frames, 2)),
        }
        grid_sizes = torch.tensor([150, 44, 80])
        mode = config["mode"]

        generator.reset(
            prompt="",
            image_path=config["image_url"],
            mouse_cond=actions["mouse"].unsqueeze(0),
            keyboard_cond=actions["keyboard"].unsqueeze(0),
            grid_sizes=grid_sizes,
            num_frames=num_frames,
            height=352,
            width=640,
            num_inference_steps=50,
            output_path=OUTPUT_PATH,
            save_video=True,
        )
        print("Initialization complete.")

        for block_id in range(max_blocks):
            print(f"\n=== Block {block_id + 1}/{max_blocks} ===")
            action = await get_current_action_async(mode)
            # NOTE(review): 12 is presumably the number of frames per block; confirm.
            keyboard_cond, mouse_cond = expand_action_to_frames(action, 12)
            await generator.step_async(keyboard_cond, mouse_cond)
            # input() blocks, so run it in a worker thread.
            if (await asyncio.to_thread(input, "\nContinue? (y/n): ")).lower() == 'n':
                break

        # Save final video
        generator.finalize()
    finally:
        generator.shutdown()


if __name__ == "__main__":
    asyncio.run(main())
basic_mps.py
from fastvideo import VideoGenerator, PipelineConfig
from fastvideo.configs.sample import SamplingParam
def main():
    """Generate two small clips with Wan2.1 1.3B using reduced sampling settings."""
    model_id = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"

    pipeline_config = PipelineConfig.from_pretrained(model_id)
    pipeline_config.text_encoder_precisions = ["fp16"]

    generator = VideoGenerator.from_pretrained(
        model_id,
        pipeline_config=pipeline_config,
        use_fsdp_inference=False,  # Disable FSDP for MPS
        dit_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,
        disable_autocast=False,
        num_gpus=1,
    )

    # Start from the model's sampling defaults, then shrink the workload.
    sampling_param = SamplingParam.from_pretrained(model_id)
    # Reduce from default 81 to 25 frames bc we have to use the SDPA attn backend for mps
    sampling_param.num_frames = 25
    sampling_param.height = 256
    sampling_param.width = 256

    prompts = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones.",
        "A majestic lion strides across the golden savanna, its powerful frame "
        "glistening under the warm afternoon sun. The tall grass ripples gently in "
        "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
        "embodying the raw energy of the wild. Low angle, steady tracking shot, "
        "cinematic.",
    )
    # The same generator serves both prompts, one after the other.
    for text in prompts:
        generator.generate_video(text, sampling_param=sampling_param)


if __name__ == "__main__":
    main()
basic_ray.py
from fastvideo import VideoGenerator
# from fastvideo.configs.sample import SamplingParam
# Output location passed to generate_video.
OUTPUT_PATH = "video_samples"


def main():
    """Generate two clips with Wan2.1 1.3B on two GPUs using the Ray executor backend."""
    # FastVideo picks default arguments for the model; for a local path it
    # makes a best-effort attempt to identify them.
    generator = VideoGenerator.from_pretrained(
        "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
        # FastVideo will automatically handle distributed setup
        num_gpus=2,
        use_fsdp_inference=True,
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,  # set to false if low CPU RAM or hit obscure "CUDA error: Invalid argument"
        distributed_executor_backend="ray",
        # image_encoder_cpu_offload=False,
    )

    prompts = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones.",
        "A majestic lion strides across the golden savanna, its powerful frame "
        "glistening under the warm afternoon sun. The tall grass ripples gently in "
        "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
        "embodying the raw energy of the wild. Low angle, steady tracking shot, "
        "cinematic.",
    )
    # Same simple API regardless of GPU count; the second prompt reuses the
    # already-loaded model.
    for text in prompts:
        generator.generate_video(text, output_path=OUTPUT_PATH, save_video=True)


if __name__ == "__main__":
    main()
basic_sd35_t2i.py
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import argparse
import os
import re
from typing import List
# Prompts used when no --prompt option is given on the command line.
DEFAULT_PROMPTS = [
    "a photo of a cat",
    "a cinematic photo of a red panda wearing a tiny backpack, standing on a rainy neon-lit street at night, shallow depth of field, sharp focus, 35mm, bokeh",
]
def _safe_filename(text: str, max_len: int = 100) -> str:
    """
    Turn ``text`` into a stable, filesystem-friendly filename stem.

    Mirrors the ``prompt[:100].strip()`` truncation that VideoGenerator uses
    internally (per the original author's note), then additionally replaces
    path separators and any character outside ``A-Za-z0-9 .,_-`` with ``_``.
    Falls back to ``"prompt"`` when nothing usable is left.
    """
    cleaned = text[:max_len].strip()

    # Neutralize the platform's path separators first.
    separators = [os.sep]
    if os.altsep:
        separators.append(os.altsep)
    for sep in separators:
        cleaned = cleaned.replace(sep, "_")

    # Collapse whitespace runs, then replace everything not on the allow-list.
    cleaned = re.sub(r"\s+", " ", cleaned)
    cleaned = re.sub(r"[^A-Za-z0-9 .,_-]", "_", cleaned)

    # Leading/trailing spaces and dots make awkward filenames.
    cleaned = cleaned.strip(" .")
    if not cleaned:
        return "prompt"
    return cleaned
def _remove_existing_outputs(out_dir: str, filename_base: str) -> None:
    """
    Delete earlier outputs for ``filename_base`` from ``out_dir``.

    Removes ``<base>.mp4`` / ``<base>.png`` and their numbered variants
    (``<base>_1.png``, ...) so that VideoGenerator does not append a suffix
    to the new file name. Does nothing when ``out_dir`` is not a directory.
    """
    if not os.path.isdir(out_dir):
        return

    stale = re.compile(rf"^{re.escape(filename_base)}(_\d+)?\.(mp4|png)$")
    doomed = [name for name in os.listdir(out_dir) if stale.match(name)]
    for name in doomed:
        try:
            os.remove(os.path.join(out_dir, name))
        except FileNotFoundError:
            # Already gone; nothing left to do.
            pass
def parse_args() -> argparse.Namespace:
p = argparse.ArgumentParser(description="Run SD3.5 Medium text-to-image with FastVideo VideoGenerator.")
p.add_argument("--model-path", default="stabilityai/stable-diffusion-3.5-medium", help="Path to local diffusers-format SD3.5 weights directory.")
p.add_argument(
"--out-dir",
"--outdir",
default="outputs/sd35/samples",
help="Output directory for generated mp4 files.",
)
p.add_argument(
"--prompt",
action="append",
default=None,
help="Prompt text. Repeat --prompt multiple times to generate multiple samples.",
)
p.add_argument("--negative", default="lowres, blurry, jpeg artifacts, watermark, text", help="Negative prompt.")
p.add_argument(
"--backend",
default=None,
help="Set FASTVIDEO_ATTENTION_BACKEND (e.g. TORCH_SDPA). If omitted, respects the existing env var.",
)
p.add_argument("--seed", type=int, default=42, help="Base seed. Each prompt uses seed + prompt_idx.")
p.add_argument("--height", type=int, default=768, help="Output height.")
p.add_argument("--width", type=int, default=768, help="Output width.")
p.add_argument("--steps", type=int, default=28, help="Number of inference steps.")
p.add_argument("--guidance", type=float, default=6.0, help="Guidance scale.")
p.add_argument("--num-gpus", type=int, default=1, help="Number of GPUs to use.")
return p.parse_args()
def main() -> None:
    """Generate one PNG per prompt with SD3.5 via VideoGenerator."""
    args = parse_args()
    prompts: List[str] = args.prompt or DEFAULT_PROMPTS

    # The backend must be in the environment before fastvideo is imported.
    if args.backend:
        os.environ["FASTVIDEO_ATTENTION_BACKEND"] = args.backend

    from fastvideo import VideoGenerator

    os.makedirs(args.out_dir, exist_ok=True)

    generator = VideoGenerator.from_pretrained(
        model_path=args.model_path,
        num_gpus=args.num_gpus,
        workload_type="t2i",
        sp_size=1,
        tp_size=1,
        dit_cpu_offload=False,
        dit_layerwise_offload=False,
        text_encoder_cpu_offload=False,
        vae_cpu_offload=False,
        image_encoder_cpu_offload=False,
        pin_cpu_memory=False,
        use_fsdp_inference=False,
    )

    try:
        for idx, text in enumerate(prompts):
            # Each prompt gets its own seed, offset from the base seed.
            seed = args.seed + idx
            stem = f"sd35_{idx:02d}_seed{seed}_{_safe_filename(text, max_len=80)}"

            # Clear stale files so the output name is deterministic.
            _remove_existing_outputs(args.out_dir, stem)
            output_path = os.path.join(args.out_dir, f"{stem}.png")
            print(f"[sd35] prompt_idx={idx} seed={seed} output_path={output_path}")

            # A single frame at fps=1 produces an image.
            generator.generate_video(
                text,
                output_path=output_path,
                height=args.height,
                width=args.width,
                num_frames=1,
                fps=1,
                num_inference_steps=args.steps,
                guidance_scale=args.guidance,
                seed=seed,
                negative_prompt=args.negative,
                save_video=True,
            )

        print(f"[sd35] done. outputs written to: {args.out_dir}")
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()
basic_self_forcing_causal.py
import os
import time
from fastvideo import VideoGenerator, SamplingParam
# Output location passed to generate_video.
OUTPUT_PATH = "video_samples_causal"


def main():
    """Generate one clip with the self-forcing causal Wan2.1 1.3B checkpoint."""
    model_id = "wlsaidhi/SFWan2.1-T2V-1.3B-Diffusers"

    # FastVideo picks default arguments for the model; for a local path it
    # makes a best-effort attempt to identify them.
    generator = VideoGenerator.from_pretrained(
        model_id,
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        text_encoder_cpu_offload=False,
        dit_cpu_offload=False,
    )

    # Sampling defaults for this checkpoint, passed through unchanged.
    sampling_param = SamplingParam.from_pretrained(model_id)

    text = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
    )
    generator.generate_video(
        text,
        output_path=OUTPUT_PATH,
        save_video=True,
        sampling_param=sampling_param,
    )


if __name__ == "__main__":
    main()
basic_self_forcing_causal_wan2_2_i2v.py
# NOTE: This is still a work in progress, and the checkpoints are not released yet.
from fastvideo import VideoGenerator, SamplingParam
import json
# from fastvideo.configs.sample import SamplingParam
# Output location passed to generate_video.
OUTPUT_PATH = "video_samples_self_forcing_causal_wan2_2_14B_i2v"


def _load_prompt_image_pairs(path):
    """Return the list of prompt/image records stored in ``path``.

    The file carries a ``.jsonl`` extension, so two layouts are accepted:
    a single JSON document (an array of objects, or one object), or JSON
    Lines with one object per non-blank line.
    """
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Not one JSON document: parse it line by line as JSON Lines.
        return [json.loads(line) for line in text.splitlines() if line.strip()]
    # A lone object (e.g. a one-line JSONL file) becomes a one-element list.
    return [data] if isinstance(data, dict) else data


def main():
    """Generate one clip per prompt/image pair listed in the prompt file."""
    # FastVideo will automatically use the optimal default arguments for the
    # model.
    # If a local path is provided, FastVideo will make a best effort
    # attempt to identify the optimal arguments.
    generator = VideoGenerator.from_pretrained(
        "FastVideo/SFWan2.2-I2V-A14B-Preview-Diffusers",
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,  # DiT need to be offloaded for MoE
        dit_precision="fp32",
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        dmd_denoising_steps=[1000, 850, 700, 550, 350, 275, 200, 125],
        # Set pin_cpu_memory to false if CPU RAM is limited and there're no frequent CPU-GPU transfer
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )

    sampling_param = SamplingParam.from_pretrained("FastVideo/SFWan2.2-I2V-A14B-Preview-Diffusers")
    sampling_param.num_frames = 81
    sampling_param.width = 832
    sampling_param.height = 480
    sampling_param.seed = 1000

    prompt_image_pairs = _load_prompt_image_pairs("assets/prompts/mixkit_i2v.jsonl")

    for prompt_image_pair in prompt_image_pairs:
        prompt = prompt_image_pair["prompt"]
        image_path = prompt_image_pair["image_path"]
        _ = generator.generate_video(
            prompt,
            image_path=image_path,
            output_path=OUTPUT_PATH,
            save_video=True,
            sampling_param=sampling_param,
        )


if __name__ == "__main__":
    main()
basic_self_forcing_causal_wan2_2_t2v.py
# NOTE: This is still a work in progress, and the checkpoints are not released yet.
from fastvideo import VideoGenerator
# from fastvideo.configs.sample import SamplingParam
# Output location passed to generate_video.
OUTPUT_PATH = "video_samples_self_forcing_causal_wan2_2_14B_t2v"


def main():
    """Generate one 81-frame clip with the self-forcing causal Wan2.2 T2V checkpoint."""
    # NOTE(review): both safetensors locations are absolute paths on a shared
    # filesystem; point them at your own checkpoints before running.
    checkpoint_root = "/mnt/sharefs/users/hao.zhang/wei/SFwan2.2_distill_self_forcing_release_cfg2/checkpoint-246_weight_only/"

    # FastVideo picks default arguments for the model; for a local path it
    # makes a best-effort attempt to identify them.
    generator = VideoGenerator.from_pretrained(
        "rand0nmr/SFWan2.2-T2V-A14B-Diffusers",
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,  # DiT need to be offloaded for MoE
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        dmd_denoising_steps=[1000, 850, 700, 550, 350, 275, 200, 125],
        # Set pin_cpu_memory to false if CPU RAM is limited and there're no frequent CPU-GPU transfer
        pin_cpu_memory=True,
        init_weights_from_safetensors=checkpoint_root + "generator_inference_transformer/",
        init_weights_from_safetensors_2=checkpoint_root + "generator_2_inference_transformer/",
        num_frame_per_block=7,
        # image_encoder_cpu_offload=False,
    )

    # Generate videos with the same simple API, regardless of GPU count
    text = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
    )
    generator.generate_video(
        text,
        output_path=OUTPUT_PATH,
        save_video=True,
        num_frames=81,
    )


if __name__ == "__main__":
    main()
basic_turbodiffusion.py
import os

# The SLA attention backend must be selected BEFORE fastvideo is imported.
os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "SLA_ATTN"

from fastvideo import VideoGenerator

# Output location passed to generate_video.
OUTPUT_PATH = "video_samples_turbodiffusion"


def main() -> None:
    """Generate two clips with the TurboDiffusion Wan2.1 1.3B checkpoint."""
    # TurboDiffusion: 1-4 step video generation using RCM scheduler + SLA attention
    # FastVideo will automatically use TurboDiffusionPipeline when specified
    generator = VideoGenerator.from_pretrained(
        "loayrashid/TurboWan2.1-T2V-1.3B-Diffusers",
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        # set to false if using RTX 4090
        # pin_cpu_memory=False,
    )

    # TurboDiffusion defaults: guidance_scale=1.0 and num_inference_steps=4 (from config)
    prompts = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones.",
        "A majestic lion strides across the golden savanna, its powerful frame "
        "glistening under the warm afternoon sun. The tall grass ripples gently in "
        "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
        "embodying the raw energy of the wild. Low angle, steady tracking shot, "
        "cinematic.",
    )
    # Same simple API regardless of GPU count; the second prompt reuses the
    # already-loaded model.
    for text in prompts:
        generator.generate_video(
            text,
            output_path=OUTPUT_PATH,
            save_video=True,
            seed=42,
        )


if __name__ == "__main__":
    main()
basic_turbodiffusion_14b.py
import os

# The SLA attention backend must be selected BEFORE fastvideo is imported.
os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "SLA_ATTN"

from fastvideo import VideoGenerator

# Output location passed to generate_video.
OUTPUT_PATH = "video_samples_turbodiffusion_14B"


def main() -> None:
    """Generate two clips with the TurboDiffusion Wan2.1 14B checkpoint on two GPUs."""
    # TurboDiffusion 14B: 1-4 step video generation using RCM scheduler + SLA attention
    # FastVideo will automatically use TurboDiffusionPipeline when specified
    generator = VideoGenerator.from_pretrained(
        "loayrashid/TurboWan2.1-T2V-14B-Diffusers",
        # 14B model needs more GPUs
        num_gpus=2,
    )

    prompts = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones.",
        "A majestic lion strides across the golden savanna, its powerful frame "
        "glistening under the warm afternoon sun. The tall grass ripples gently in "
        "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
        "embodying the raw energy of the wild. Low angle, steady tracking shot, "
        "cinematic.",
    )
    # The second prompt reuses the already-loaded model.
    for text in prompts:
        generator.generate_video(
            text,
            output_path=OUTPUT_PATH,
            save_video=True,
            seed=42,
        )


if __name__ == "__main__":
    main()
basic_turbodiffusion_i2v.py
import os
# Select the SLA attention backend through the environment. This assignment
# must run BEFORE the fastvideo import below, so do not reorder these lines.
os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "SLA_ATTN"
from fastvideo import VideoGenerator
# Model identifier handed to VideoGenerator.from_pretrained in main().
# NOTE(review): the value is a hub-style "owner/name" ID, not a local path;
# replace it with a local checkpoint directory if you want to load from disk.
MODEL_PATH = "loayrashid/TurboWan2.2-I2V-A14B-Diffusers"
# Passed as `output_path` to generate_video in main().
OUTPUT_PATH = "video_samples_turbodiffusion_i2v"
def main() -> None:
    """Generate one image-to-video sample with the TurboDiffusion I2V checkpoint.

    The conditioning image is supplied through ``image_path`` alongside the
    text prompt.
    """
    # TurboDiffusion I2V: 1-4 step image-to-video generation.
    turbo_generator = VideoGenerator.from_pretrained(MODEL_PATH, num_gpus=2)

    # Text prompt describing the scene to animate.
    cat_prompt = (
        "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. "
        "The fluffy-furred feline gazes directly at the camera with a relaxed expression. "
        "Blurred beach scenery forms the background featuring crystal-clear waters, "
        "distant green hills, and a blue sky dotted with white clouds. "
        "The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. "
        "A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside."
    )
    # Example input image, referenced by URL.
    input_image_url = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/wan_i2v_input.JPG"

    _ = turbo_generator.generate_video(
        cat_prompt,
        image_path=input_image_url,
        output_path=OUTPUT_PATH,
        save_video=True,
        seed=42,
    )


if __name__ == "__main__":
    main()
basic_wan2_2.py
from fastvideo import VideoGenerator
# Optional import, only needed by the commented-out SamplingParam example in main():
# from fastvideo.configs.sample import SamplingParam
# Passed as `output_path` to every generate_video call in main().
OUTPUT_PATH = "video_samples_wan2_2_14B_t2v"
def main():
    """Generate two 1280x720, 81-frame text-to-video samples with Wan2.2 T2V A14B.

    The generator is created once and reused for both prompts, so the model is
    not reloaded between videos.
    """
    # FastVideo automatically uses the optimal default arguments for the model.
    # When a local path is given instead, it makes a best-effort attempt to
    # identify them.
    wan_generator = VideoGenerator.from_pretrained(
        "Wan-AI/Wan2.2-T2V-A14B-Diffusers",
        num_gpus=2,  # FastVideo handles the distributed setup automatically
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,  # DiT needs to be offloaded for MoE
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        # Set to False if CPU RAM is limited and there are no frequent
        # CPU-GPU transfers.
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )

    # Optional alternative: configure sampling through a SamplingParam object.
    # sampling_param = SamplingParam.from_pretrained("Wan-AI/Wan2.1-T2V-1.3B-Diffusers")
    # sampling_param.num_frames = 45
    # sampling_param.image_path = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
    # video = wan_generator.generate_video(prompt, sampling_param=sampling_param, output_path="wan_t2v_videos/")

    raccoon_prompt = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
    )
    lion_prompt = (
        "A majestic lion strides across the golden savanna, its powerful frame "
        "glistening under the warm afternoon sun. The tall grass ripples gently in "
        "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
        "embodying the raw energy of the wild. Low angle, steady tracking shot, "
        "cinematic."
    )

    # The same simple API is used regardless of GPU count; both videos share
    # these settings and differ only in the prompt.
    render_options = {
        "output_path": OUTPUT_PATH,
        "save_video": True,
        "height": 720,
        "width": 1280,
        "num_frames": 81,
    }
    for text in (raccoon_prompt, lion_prompt):
        _ = wan_generator.generate_video(text, **render_options)


if __name__ == "__main__":
    main()
basic_wan2_2_Fun.py
from fastvideo import VideoGenerator
# from fastvideo.configs.sample import SamplingParam
# Passed as `output_path` to generate_video in main().
# NOTE(review): the names below say "wan2_1" although this script is
# basic_wan2_2_Fun.py; main() currently loads a Wan2.1-Fun checkpoint, which
# presumably explains the naming - confirm before renaming.
OUTPUT_PATH = "video_samples_wan2_1_Fun"
# Passed as `output_video_name` to generate_video in main().
OUTPUT_NAME = "wan2.1_test"
def main():
    """Generate one control-video-guided sample with a Wan Fun Control checkpoint.

    The call combines a text prompt, a negative prompt, a reference image
    (``image_path``) and a control video (``video_path``).
    """
    # FastVideo automatically uses the optimal default arguments for the model.
    # When a local path is given instead, it makes a best-effort attempt to
    # identify them.
    fun_generator = VideoGenerator.from_pretrained(
        "IRMChen/Wan2.1-Fun-1.3B-Control-Diffusers",
        # Alternative checkpoint:
        # "alibaba-pai/Wan2.2-Fun-A14B-Control",
        num_gpus=1,  # FastVideo handles the distributed setup automatically
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        # Original note: "DiT need to be offloaded for MoE".
        # NOTE(review): confirm this still applies to the 1.3B checkpoint
        # selected above.
        dit_cpu_offload=True,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        # Set to False if CPU RAM is limited and there are no frequent
        # CPU-GPU transfers.
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )

    prompt = "一位年轻女性穿着一件粉色的连衣裙,裙子上有白色的装饰和粉色的纽扣。她的头发是紫色的,头上戴着一个红色的大蝴蝶结,显得非常可爱和精致。她还戴着一个红色的领结,整体造型充满了少女感和活力。她的表情温柔,双手轻轻交叉放在身前,姿态优雅。背景是简单的灰色,没有任何多余的装饰,使得人物更加突出。她的妆容清淡自然,突显了她的清新气质。整体画面给人一种甜美、梦幻的感觉,仿佛置身于童话世界中。"
    negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"
    # English alternatives for the two prompts above:
    # prompt = "A young woman with beautiful, clear eyes and blonde hair stands in the forest, wearing a white dress and a crown. Her expression is serene, reminiscent of a movie star, with fair and youthful skin. Her brown long hair flows in the wind. The video quality is very high, with a clear view. High quality, masterpiece, best quality, high resolution, ultra-fine, fantastical."
    # negative_prompt = "Twisted body, limb deformities, text captions, comic, static, ugly, error, messy code."

    # Both conditioning inputs are referenced by URL.
    reference_image_url = "https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/wan_fun/asset_Wan2_2/v1.0/8.png"
    pose_video_url = "https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/wan_fun/asset_Wan2_2/v1.0/pose.mp4"

    _ = fun_generator.generate_video(
        prompt,
        negative_prompt=negative_prompt,
        image_path=reference_image_url,
        video_path=pose_video_url,
        output_path=OUTPUT_PATH,
        output_video_name=OUTPUT_NAME,
        save_video=True,
    )


if __name__ == "__main__":
    main()
basic_wan2_2_i2v.py
from fastvideo import VideoGenerator
# from fastvideo.configs.sample import SamplingParam
# Passed as `output_path` to generate_video in main().
OUTPUT_PATH = "video_samples_wan2_2_14B_i2v"
def main():
    """Generate one 81-frame image-to-video sample with Wan2.2 I2V A14B.

    The conditioning image is supplied through ``image_path`` alongside the
    text prompt.
    """
    # FastVideo automatically uses the optimal default arguments for the model.
    # When a local path is given instead, it makes a best-effort attempt to
    # identify them.
    i2v_generator = VideoGenerator.from_pretrained(
        "Wan-AI/Wan2.2-I2V-A14B-Diffusers",
        num_gpus=1,  # FastVideo handles the distributed setup automatically
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,  # DiT needs to be offloaded for MoE
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        # Set to False if CPU RAM is limited and there are no frequent
        # CPU-GPU transfers.
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )

    cat_prompt = (
        "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. "
        "The fluffy-furred feline gazes directly at the camera with a relaxed expression. "
        "Blurred beach scenery forms the background featuring crystal-clear waters, "
        "distant green hills, and a blue sky dotted with white clouds. "
        "The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. "
        "A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside."
    )
    input_image_url = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/wan_i2v_input.JPG"

    # NOTE(review): height (832) is larger than width (480), i.e. a portrait
    # frame; confirm this orientation is intended for the input image.
    _ = i2v_generator.generate_video(
        cat_prompt,
        image_path=input_image_url,
        output_path=OUTPUT_PATH,
        save_video=True,
        height=832,
        width=480,
        num_frames=81,
    )


if __name__ == "__main__":
    main()
basic_wan2_2_ti2v.py
from fastvideo import VideoGenerator
# Passed as `output_path` to every generate_video call in main().
OUTPUT_PATH = "video_samples_wan2_2_5B_ti2v"
def main():
    """Generate an I2V sample and then a T2V sample with Wan2.2 TI2V 5B.

    The same generator serves both modes: the first call passes an
    ``image_path`` (image-to-video), the second omits it (text-to-video), and
    the model is not reloaded in between.
    """
    # FastVideo automatically uses the optimal default arguments for the model.
    # When a local path is given instead, it makes a best-effort attempt to
    # identify them.
    model_name = "Wan-AI/Wan2.2-TI2V-5B-Diffusers"
    ti2v_generator = VideoGenerator.from_pretrained(
        model_name,
        num_gpus=1,  # FastVideo handles the distributed setup automatically
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        # Set to False if CPU RAM is low or you hit the obscure
        # "CUDA error: Invalid argument".
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )

    # --- I2V mode: triggered just by passing in an image_path argument. ---
    cat_prompt = (
        "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. "
        "The fluffy-furred feline gazes directly at the camera with a relaxed expression. "
        "Blurred beach scenery forms the background featuring crystal-clear waters, "
        "distant green hills, and a blue sky dotted with white clouds. "
        "The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. "
        "A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside."
    )
    input_image_url = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/wan_i2v_input.JPG"
    _ = ti2v_generator.generate_video(
        cat_prompt,
        output_path=OUTPUT_PATH,
        save_video=True,
        image_path=input_image_url,
    )

    # --- T2V mode: a different prompt and no image, without reloading the model. ---
    lion_prompt = (
        "A majestic lion strides across the golden savanna, its powerful frame "
        "glistening under the warm afternoon sun. The tall grass ripples gently in "
        "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
        "embodying the raw energy of the wild. Low angle, steady tracking shot, "
        "cinematic."
    )
    _ = ti2v_generator.generate_video(
        lion_prompt,
        output_path=OUTPUT_PATH,
        save_video=True,
    )


if __name__ == "__main__":
    main()