Source: examples/inference/basic
Basic Video Generation Tutorial¶
The VideoGenerator class provides the primary Python interface for offline video generation, that is, interacting with a diffusion pipeline directly without running a separate inference API server.
Requirements¶
- At least a single NVIDIA GPU with CUDA 12.4.
- Python 3.10-3.12
Installation¶
If you have not installed FastVideo, please follow these instructions first.
Usage¶
The first script in this example shows the most basic usage of FastVideo. If you are new to Python and FastVideo, you should start here.
# if you have not cloned the directory:
git clone https://github.com/hao-ai-lab/FastVideo.git && cd FastVideo
python examples/inference/basic/basic.py
For an example on Apple silicon:
For an example running DMD+VSA inference:
For the typed config/request path added during the inference API refactor:
Basic Walkthrough¶
All you need to generate videos on one or more GPUs with state-of-the-art diffusion pipelines is the following few lines!
from fastvideo import VideoGenerator
def main():
    """Generate one video from a text prompt using only the prompt argument."""
    # Load the pretrained text-to-video pipeline on a single GPU.
    pipeline = VideoGenerator.from_pretrained("Wan-AI/Wan2.1-T2V-1.3B-Diffusers", num_gpus=1)

    description = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
    )
    clip = pipeline.generate_video(description)


if __name__ == "__main__":
    main()
Additional Files¶
basic.py
from fastvideo import VideoGenerator
# from fastvideo.api.sampling_param import SamplingParam
OUTPUT_PATH = "video_samples"
def main():
    """Generate two sample videos with a single generator instance.

    The model is loaded once and reused for both prompts. Both results are
    saved under ``OUTPUT_PATH``. The generator is shut down when generation
    finishes or fails, matching the other example scripts.
    """
    # FastVideo will automatically use the optimal default arguments for the
    # model.
    # If a local path is provided, FastVideo will make a best effort
    # attempt to identify the optimal arguments.
    generator = VideoGenerator.from_pretrained(
        "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,  # set to false if low CPU RAM or hit obscure "CUDA error: Invalid argument"
        # image_encoder_cpu_offload=False,
    )

    # Optional: customize sampling instead of relying on the model defaults.
    # sampling_param = SamplingParam.from_pretrained("Wan-AI/Wan2.1-T2V-1.3B-Diffusers")
    # sampling_param.num_frames = 45
    # sampling_param.image_path = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"

    prompt = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
    )
    prompt2 = (
        "A majestic lion strides across the golden savanna, its powerful frame "
        "glistening under the warm afternoon sun. The tall grass ripples gently in "
        "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
        "embodying the raw energy of the wild. Low angle, steady tracking shot, "
        "cinematic."
    )

    try:
        # Generate videos with the same simple API, regardless of GPU count
        generator.generate_video(prompt, output_path=OUTPUT_PATH, save_video=True)
        # generator.generate_video(prompt, sampling_param=sampling_param, output_path="wan_t2v_videos/")

        # Generate another video with a different prompt, without reloading the
        # model!
        generator.generate_video(prompt2, output_path=OUTPUT_PATH, save_video=True)
    finally:
        # Release the generator even if one of the generations raised.
        generator.shutdown()


if __name__ == "__main__":
    main()
basic_cosmos2_5_i2w.py
# SPDX-License-Identifier: Apache-2.0
from fastvideo import VideoGenerator
from fastvideo.api.sampling_param import SamplingParam
def main():
    """Generate a Cosmos-Predict2.5 video conditioned on a single input image.

    The result is saved to ``outputs_video/cosmos2_5_i2w.mp4``. The generator
    is shut down in a ``finally`` block so it is released even when sampling
    setup or generation raises.
    """
    # Point this to your local diffusers model dir (or replace with a HF model ID).
    model_path = "KyleShao/Cosmos-Predict2.5-2B-Diffusers"

    generator = VideoGenerator.from_pretrained(
        model_path,
        num_gpus=1,
        use_fsdp_inference=False,  # set True if GPU is out of memory
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,
    )

    try:
        sampling_param = SamplingParam.from_pretrained(model_path)

        # image2world example from official repo
        image_path = "assets/images/bus_terminal.jpg"
        prompt = (
            "A nighttime city bus terminal gradually shifts from stillness to subtle movement. "
            "At first, multiple double-decker buses are parked under the glow of overhead lights, "
            "with a central bus labeled '87D' facing forward and stationary. "
            "As the video progresses, the bus in the middle moves ahead slowly, its headlights brightening the surrounding area "
            "and casting reflections onto adjacent vehicles. "
            "The motion creates space in the lineup, signaling activity within the otherwise quiet station. "
            "It then comes to a smooth stop, resuming its position in line. "
            "Overhead signage in Chinese characters remains illuminated, enhancing the vibrant, urban night scene."
        )

        generator.generate_video(
            prompt,
            sampling_param=sampling_param,
            image_path=image_path,  # already a str; no conversion needed
            num_cond_frames=1,
            output_path="outputs_video/cosmos2_5_i2w.mp4",
            save_video=True,
        )
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()
basic_cosmos2_5_t2w.py
from fastvideo import VideoGenerator
from fastvideo.api.sampling_param import SamplingParam
def main():
    """Generate a Cosmos-Predict2.5 video from a text prompt only.

    The result is saved to ``outputs_video/cosmos2_5_t2w.mp4``. The generator
    is shut down in a ``finally`` block so it is released even when sampling
    setup or generation raises.
    """
    # Point this to your local diffusers model dir (or replace with a HF model ID).
    model_path = "KyleShao/Cosmos-Predict2.5-2B-Diffusers"

    generator = VideoGenerator.from_pretrained(
        model_path,
        num_gpus=1,
        use_fsdp_inference=False,  # set True if GPU is out of memory
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,
    )

    try:
        # Load default sampling parameters (negative_prompt, resolution, steps, etc.)
        sampling_param = SamplingParam.from_pretrained(model_path)

        prompt = (
            "A high-definition video captures the precision of robotic welding in an industrial setting. "
            "The first frame showcases a robotic arm, equipped with a welding torch, positioned over a large metal structure. "
            "The welding process is in full swing, with bright sparks and intense light illuminating the scene, "
            "creating a vivid display of blue and white hues. "
            "A significant amount of smoke billows around the welding area, partially obscuring the view but emphasizing the heat and activity. "
            "The background reveals parts of the workshop environment, including a ventilation system and various pieces of machinery, "
            "indicating a busy and functional industrial workspace. "
            "As the video progresses, the robotic arm maintains its steady position, continuing the welding process and moving to its left. "
            "The welding torch consistently emits sparks and light, and the smoke continues to rise, diffusing slightly as it moves upward. "
            "The metal surface beneath the torch shows ongoing signs of heating and melting. "
            "The scene retains its industrial ambiance, with the welding sparks and smoke dominating the visual field, "
            "underscoring the ongoing nature of the welding operation."
        )

        generator.generate_video(
            prompt,
            sampling_param=sampling_param,
            output_path="outputs_video/cosmos2_5_t2w.mp4",
            save_video=True,
        )
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()
basic_cosmos2_5_v2w.py
# SPDX-License-Identifier: Apache-2.0
from fastvideo import VideoGenerator
from fastvideo.api.sampling_param import SamplingParam
def main():
    """Generate a Cosmos-Predict2.5 video conditioned on an input video.

    The result is saved to ``outputs_video/cosmos2_5_v2w.mp4``. The generator
    is shut down in a ``finally`` block so it is released even when sampling
    setup or generation raises.
    """
    # Point this to your local diffusers model dir (or replace with a HF model ID).
    model_path = "KyleShao/Cosmos-Predict2.5-2B-Diffusers"

    generator = VideoGenerator.from_pretrained(
        model_path,
        num_gpus=1,
        use_fsdp_inference=False,  # set True if GPU is out of memory
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,
    )

    try:
        sampling_param = SamplingParam.from_pretrained(model_path)

        # video2world example from official repo
        video_path = "assets/videos/robot_pouring.mp4"
        prompt = (
            "A robotic arm, primarily white with black joints and cables, is shown in a clean, modern indoor setting with a white tabletop. "
            "The arm, equipped with a gripper holding a small, light green pitcher, is positioned above a clear glass containing a reddish-brown liquid and a spoon. "
            "The robotic arm is in the process of pouring a transparent liquid into the glass. "
            "To the left of the pitcher, there is an opened jar with a similar reddish-brown substance visible through its transparent body. "
            "In the background, a vase with white flowers and a brown couch are partially visible, adding to the contemporary ambiance. "
            "The lighting is bright, casting soft shadows on the table. "
            "The robotic arm's movements are smooth and controlled, demonstrating precision in its task. "
            "As the video progresses, the robotic arm completes the pour, leaving the glass half-filled with the reddish-brown liquid. "
            "The jar remains untouched throughout the sequence, and the spoon inside the glass remains stationary. "
            "The other robotic arm on the right side also stays stationary throughout the video. "
            "The final frame captures the robotic arm with the pitcher finishing the pour, with the glass now filled to a higher level, while the pitcher is slightly tilted but still held securely by the gripper."
        )

        generator.generate_video(
            prompt,
            sampling_param=sampling_param,
            video_path=video_path,  # already a str; no conversion needed
            num_cond_frames=1,
            output_path="outputs_video/cosmos2_5_v2w.mp4",
            save_video=True,
        )
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()
basic_dmd.py
import os
import time
from fastvideo import VideoGenerator
from fastvideo.api.sampling_param import SamplingParam
OUTPUT_PATH = "video_samples_dmd2"
def main():
    """Time model loading and two generations with the VSA attention backend.

    Both videos are saved under ``OUTPUT_PATH``; the three measured durations
    are printed at the end.
    """
    # Pick the attention backend through the environment before loading.
    os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "VIDEO_SPARSE_ATTN"

    load_clock = time.perf_counter()
    model_name = "FastVideo/FastWan2.1-T2V-1.3B-Diffusers"
    generator = VideoGenerator.from_pretrained(
        model_name,
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        # Adjust these offload parameters if you have < 32GB of VRAM
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,  # set to false if low CPU RAM or hit obscure "CUDA error: Invalid argument"
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        VSA_sparsity=0.8,
    )
    load_time = time.perf_counter() - load_clock

    sampling_param = SamplingParam.from_pretrained(model_name)

    # First generation: explicit sampling parameters.
    tokyo_prompt = (
        "A neon-lit alley in futuristic Tokyo during a heavy rainstorm at night. The puddles reflect glowing signs in kanji, advertising ramen, karaoke, and VR arcades. A woman in a translucent raincoat walks briskly with an LED umbrella. Steam rises from a street food cart, and a cat darts across the screen. Raindrops are visible on the camera lens, creating a cinematic bokeh effect."
    )
    first_clock = time.perf_counter()
    first_video = generator.generate_video(
        tokyo_prompt,
        output_path=OUTPUT_PATH,
        save_video=True,
        sampling_param=sampling_param,
    )
    gen_time = time.perf_counter() - first_clock

    # Second generation reuses the already-loaded model with a new prompt.
    lion_prompt = (
        "A majestic lion strides across the golden savanna, its powerful frame "
        "glistening under the warm afternoon sun. The tall grass ripples gently in "
        "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
        "embodying the raw energy of the wild. Low angle, steady tracking shot, "
        "cinematic."
    )
    second_clock = time.perf_counter()
    second_video = generator.generate_video(lion_prompt, output_path=OUTPUT_PATH, save_video=True)
    gen_time2 = time.perf_counter() - second_clock

    print(f"Time taken to load model: {load_time} seconds")
    print(f"Time taken to generate video: {gen_time} seconds")
    print(f"Time taken to generate video2: {gen_time2} seconds")


if __name__ == "__main__":
    main()
basic_dmd_new_api.py
import os
import time
from fastvideo import VideoGenerator
from fastvideo.api import (
EngineConfig,
GenerationRequest,
GeneratorConfig,
OffloadConfig,
OutputConfig,
PipelineSelection,
)
OUTPUT_PATH = "video_samples_dmd2_typed"
def main():
    """Run two timed generations through the typed config/request API.

    Builds a ``GeneratorConfig``, creates the generator from it, submits two
    ``GenerationRequest`` objects, and prints timings plus output paths.
    """
    os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "VIDEO_SPARSE_ATTN"

    model_name = "FastVideo/FastWan2.1-T2V-1.3B-Diffusers"
    offload = OffloadConfig(
        text_encoder=True,
        pin_cpu_memory=True,
        dit=False,
        vae=False,
    )
    engine = EngineConfig(
        num_gpus=1,
        use_fsdp_inference=False,
        offload=offload,
    )
    # PR 2 still routes a few advanced inference knobs through the
    # compatibility bridge until they get first-class typed fields.
    pipeline = PipelineSelection(experimental={"VSA_sparsity": 0.8})
    generator_config = GeneratorConfig(
        model_path=model_name,
        engine=engine,
        pipeline=pipeline,
    )

    load_clock = time.perf_counter()
    generator = VideoGenerator.from_config(generator_config)
    load_time = time.perf_counter() - load_clock

    def build_request(text):
        # Each request gets its own OutputConfig: save to disk, no frames returned.
        return GenerationRequest(
            prompt=text,
            output=OutputConfig(
                output_path=OUTPUT_PATH,
                save_video=True,
                return_frames=False,
            ),
        )

    request = build_request(
        "A neon-lit alley in futuristic Tokyo during a heavy rainstorm at night. "
        "The puddles reflect glowing signs in kanji, advertising ramen, karaoke, "
        "and VR arcades. A woman in a translucent raincoat walks briskly with an "
        "LED umbrella. Steam rises from a street food cart, and a cat darts "
        "across the screen. Raindrops are visible on the camera lens, creating "
        "a cinematic bokeh effect."
    )
    first_clock = time.perf_counter()
    result = generator.generate(request)
    gen_time = time.perf_counter() - first_clock

    request2 = build_request(
        "A majestic lion strides across the golden savanna, its powerful frame "
        "glistening under the warm afternoon sun. The tall grass ripples gently "
        "in the breeze, enhancing the lion's commanding presence. The tone is "
        "vibrant, embodying the raw energy of the wild. Low angle, steady "
        "tracking shot, cinematic."
    )
    second_clock = time.perf_counter()
    result2 = generator.generate(request2)
    gen_time2 = time.perf_counter() - second_clock

    print(f"Time taken to load model: {load_time} seconds")
    print(f"Time taken to generate video: {gen_time} seconds")
    print(f"First output written to: {result.video_path}")
    print(f"Time taken to generate video2: {gen_time2} seconds")
    print(f"Second output written to: {result2.video_path}")


if __name__ == "__main__":
    main()
basic_flux2.py
# SPDX-License-Identifier: Apache-2.0
"""Run full Flux2 text-to-image generation through FastVideo.
User story:
"I have a local or HF Diffusers-format full Flux2 checkpoint and want a
minimal text-to-image generation command that uses embedded guidance."
"""
import argparse
import os
from pathlib import Path
from fastvideo import VideoGenerator
from fastvideo.api import (
ComponentConfig,
EngineConfig,
GenerationRequest,
GeneratorConfig,
OffloadConfig,
OutputConfig,
ParallelismConfig,
PipelineSelection,
SamplingConfig,
)
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Run full Flux2 text-to-image generation.")
parser.add_argument(
"--model-path",
default="black-forest-labs/FLUX.2-dev",
help="HF id or local diffusers-format full Flux2 weights directory.",
)
parser.add_argument(
"--output",
default="outputs/flux2/flux2.png",
help="Output PNG path.",
)
parser.add_argument(
"--prompt",
default="a photo of a banana on a wooden table, studio lighting",
help="Text prompt.",
)
parser.add_argument("--height", type=int, default=1024)
parser.add_argument("--width", type=int, default=1024)
parser.add_argument("--steps", type=int, default=50)
parser.add_argument("--guidance-scale", type=float, default=4.0)
parser.add_argument("--max-sequence-length", type=int, default=None)
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--num-gpus", type=int, default=1)
parser.add_argument("--tp-size", type=int, default=None)
parser.add_argument("--sp-size", type=int, default=None)
parser.add_argument(
"--backend",
default=None,
help="Set FASTVIDEO_ATTENTION_BACKEND, for example TORCH_SDPA.",
)
return parser.parse_args()
def main() -> None:
    """Generate one Flux2 image from CLI options and save it to the output path.

    The generator is always shut down, whether or not generation succeeds.
    """
    args = parse_args()
    if args.backend:
        os.environ["FASTVIDEO_ATTENTION_BACKEND"] = args.backend

    # Make sure the destination directory exists before generating.
    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)

    # Fill in parallelism sizes the user did not set explicitly.
    multi_gpu = args.num_gpus > 1
    tp_size = args.tp_size
    if tp_size is None:
        tp_size = args.num_gpus if multi_gpu else 1
    sp_size = args.sp_size
    if sp_size is None:
        sp_size = 1 if multi_gpu else args.num_gpus

    offload = OffloadConfig(
        dit=False,
        vae=True,
        text_encoder=True,
        pin_cpu_memory=False,
    )
    engine = EngineConfig(
        num_gpus=args.num_gpus,
        parallelism=ParallelismConfig(tp_size=tp_size, sp_size=sp_size),
        use_fsdp_inference=False,
        offload=offload,
    )
    pipeline = PipelineSelection(
        workload_type="t2i",
        components=ComponentConfig(override_pipeline_cls_name="Flux2Pipeline"),
    )
    generator_config = GeneratorConfig(
        model_path=args.model_path,
        engine=engine,
        pipeline=pipeline,
    )

    generator = VideoGenerator.from_config(generator_config)
    try:
        # A single frame at 1 fps: the request describes one still image.
        sampling = SamplingConfig(
            height=args.height,
            width=args.width,
            num_frames=1,
            fps=1,
            num_inference_steps=args.steps,
            guidance_scale=args.guidance_scale,
            seed=args.seed,
        )

        # Only forward max_sequence_length when the user supplied it.
        if args.max_sequence_length is None:
            extensions = {}
        else:
            extensions = {"max_sequence_length": args.max_sequence_length}

        output_config = OutputConfig(
            output_path=str(output),
            save_video=True,
            return_frames=False,
        )
        request = GenerationRequest(
            prompt=args.prompt,
            sampling=sampling,
            output=output_config,
            extensions=extensions,
        )
        generator.generate(request)
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()
basic_flux2_klein.py
# SPDX-License-Identifier: Apache-2.0
"""Run Flux2 Klein text-to-image generation through FastVideo.
User story:
"I need a short local smoke for the Flux2 Klein checkpoint before wiring it
into an image workflow. Use the model's distilled four-step defaults and
write a single PNG so I can compare the output against the reference."
"""
import argparse
import os
from fastvideo import VideoGenerator
from fastvideo.api import (
EngineConfig,
GenerationRequest,
GeneratorConfig,
OffloadConfig,
OutputConfig,
PipelineSelection,
SamplingConfig,
)
DEFAULT_PROMPT = "a brushed steel espresso machine on a marble counter, morning window light"
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Run Flux2 Klein text-to-image generation.")
parser.add_argument(
"--model-path",
default="black-forest-labs/FLUX.2-klein-4B",
help="HF id or local diffusers-format Flux2 Klein weights directory.",
)
parser.add_argument(
"--output-path",
default="outputs/flux2/flux2_klein.png",
help="PNG output path or output directory.",
)
parser.add_argument("--prompt", default=DEFAULT_PROMPT, help="Prompt text.")
parser.add_argument("--seed", type=int, default=0, help="Generation seed.")
parser.add_argument("--height", type=int, default=1024, help="Output image height.")
parser.add_argument("--width", type=int, default=1024, help="Output image width.")
parser.add_argument("--steps", type=int, default=4, help="Number of denoising steps.")
parser.add_argument("--num-gpus", type=int, default=1, help="Number of GPUs to use.")
parser.add_argument(
"--backend",
default=None,
help="Set FASTVIDEO_ATTENTION_BACKEND, for example TORCH_SDPA.",
)
return parser.parse_args()
def main() -> None:
    """Generate one Flux2 Klein image from CLI options.

    The generator is always shut down, whether or not generation succeeds.
    """
    args = parse_args()
    if args.backend:
        os.environ["FASTVIDEO_ATTENTION_BACKEND"] = args.backend

    offload = OffloadConfig(
        dit=False,
        vae=True,
        text_encoder=True,
        pin_cpu_memory=False,
    )
    engine = EngineConfig(
        num_gpus=args.num_gpus,
        use_fsdp_inference=False,
        offload=offload,
    )
    generator_config = GeneratorConfig(
        model_path=args.model_path,
        engine=engine,
        pipeline=PipelineSelection(workload_type="t2i"),
    )

    generator = VideoGenerator.from_config(generator_config)
    try:
        # A single frame at 1 fps: the request describes one still image.
        sampling = SamplingConfig(
            height=args.height,
            width=args.width,
            num_frames=1,
            fps=1,
            num_inference_steps=args.steps,
            guidance_scale=1.0,
            seed=args.seed,
        )
        destination = OutputConfig(
            output_path=args.output_path,
            save_video=True,
        )
        request = GenerationRequest(
            prompt=args.prompt,
            sampling=sampling,
            output=destination,
        )
        generator.generate(request)
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()
basic_gamecraft.py
# SPDX-License-Identifier: Apache-2.0
"""
Basic inference script for HunyuanGameCraft video generation.
HunyuanGameCraft generates game-like videos with camera/action control.
It takes an optional image input and generates video with camera motion
based on simple action commands (forward, left, right, backward, rotations).
Available actions:
- forward (w): Move camera forward
- backward (s): Move camera backward
- left (a): Move camera left (strafe)
- right (d): Move camera right (strafe)
- left_rot: Rotate camera left (pan)
- right_rot: Rotate camera right (pan)
- up_rot: Rotate camera up (tilt)
- down_rot: Rotate camera down (tilt)
T2V vs I2V:
- Default: I2V (uses a default reference image). Set GAMECRAFT_I2V_IMAGE to a
URL or path to use a different image.
- T2V only (no reference image): run with GAMECRAFT_I2V_IMAGE= (empty).
"""
import os
import torch
from fastvideo import VideoGenerator
from fastvideo.models.camera import create_camera_trajectory
# Model configuration (use GAMECRAFT_MODEL_PATH for local weights)
MODEL_PATH = os.environ.get("GAMECRAFT_MODEL_PATH", "FastVideo/HunyuanGameCraft-Diffusers")
# Default prompts for demo
DEFAULT_PROMPTS = {
"village": "A charming medieval village with cobblestone streets, thatched-roof houses, and vibrant flower gardens under a bright blue sky.",
"temple": "A majestic ancient temple stands under a clear blue sky, its grandeur highlighted by towering Doric columns and intricate architectural details.",
"forest": "A lush green forest with tall trees, dappled sunlight filtering through the leaves, and a winding dirt path.",
"beach": "A tropical beach with crystal clear turquoise water, white sand, and palm trees swaying in the breeze.",
}
# I2V: default reference image (URL). Can override with a local path.
DEFAULT_I2V_IMAGE_URL = (
"https://huggingface.co/datasets/huggingface/documentation-images/"
"resolve/main/diffusers/astronaut.jpg"
)
DEFAULT_I2V_PROMPT = (
"An astronaut hatching from an egg, on the surface of the moon, "
"the darkness and depth of space realised in the background."
)
OUTPUT_PATH = "video_samples_gamecraft"
def main():
    """Generate one HunyuanGameCraft video with a "forward" camera action.

    The ``GAMECRAFT_I2V_IMAGE`` environment variable selects the mode:
    unset uses the default reference image (I2V), an empty value disables
    the image (T2V), and any other value is used as the image URL/path.
    """
    # Initialize generator
    # FastVideo will automatically download weights from HuggingFace
    generator = VideoGenerator.from_pretrained(
        MODEL_PATH,
        num_gpus=1,
        use_fsdp_inference=True,
        dit_cpu_offload=True,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,
    )

    # Video parameters
    height, width = 704, 1280
    num_frames = 33
    action = "forward"
    action_speed = 0.2

    # Create camera trajectory (Plücker coordinates)
    camera_states = create_camera_trajectory(
        action=action,
        height=height,
        width=width,
        num_frames=num_frames,
        action_speed=action_speed,
        dtype=torch.bfloat16,
    )
    print(f"Camera states shape: {camera_states.shape}")

    # I2V vs T2V: unset GAMECRAFT_I2V_IMAGE -> I2V (default image). Set to "" -> T2V.
    raw_image = os.environ.get("GAMECRAFT_I2V_IMAGE")
    if raw_image is None:
        image_path = DEFAULT_I2V_IMAGE_URL  # default: I2V
    else:
        # Blank (after stripping) means T2V; otherwise use the given URL/path.
        image_path = raw_image.strip() or None

    is_i2v = image_path is not None
    if is_i2v:
        prompt = DEFAULT_I2V_PROMPT
    else:
        prompt = DEFAULT_PROMPTS["temple"]
    print(f"Mode: {'I2V' if is_i2v else 'T2V'}, prompt: {prompt[:60]}...")

    gen_kw = {
        "prompt": prompt,
        "negative_prompt": "",
        "camera_states": camera_states,
        "height": height,
        "width": width,
        "num_frames": num_frames,
        "num_inference_steps": 50,
        "guidance_scale": 6.0,
        "seed": 42,
        "fps": 24,
        "output_path": OUTPUT_PATH,
        "save_video": True,
    }
    # The image argument is only passed in I2V mode.
    if is_i2v:
        gen_kw["image_path"] = image_path

    generator.generate_video(**gen_kw)


if __name__ == "__main__":
    main()
basic_gen3c.py
"""
GEN3C: 3D-aware camera-controlled video generation.
This example generates a video from a single input image with camera control.
The pipeline uses MoGe depth estimation, 3D point cloud forward warping,
and the GEN3C diffusion model.
Requirements:
1. Install MoGe:
uv pip install git+https://github.com/microsoft/MoGe.git
If you hit `ImportError: libGL.so.1`, install:
sudo apt-get update && sudo apt-get install -y libgl1 libglib2.0-0 libsm6 libxext6 libxrender1
2. Download and convert weights:
huggingface-cli download nvidia/GEN3C-Cosmos-7B --local-dir official_weights/GEN3C-Cosmos-7B
python scripts/checkpoint_conversion/convert_gen3c_to_fastvideo.py \
--source ./official_weights/GEN3C-Cosmos-7B/model.pt \
--output ./converted_weights/GEN3C-Cosmos-7B \
--components-source nvidia/Cosmos-Predict2-2B-Video2World
3. Provide an input image for 3D-conditioned generation.
"""
import argparse
from fastvideo import VideoGenerator
def main():
    """Parse CLI options and generate one GEN3C camera-controlled video.

    The generator is shut down in a ``finally`` block so it is released even
    when generation raises.
    """
    parser = argparse.ArgumentParser(description="GEN3C video generation")
    parser.add_argument("--model_path",
                        type=str,
                        default="converted_weights/GEN3C-Cosmos-7B")
    parser.add_argument("--image_path",
                        type=str,
                        default=None,
                        help="Input image for 3D cache conditioning")
    parser.add_argument("--prompt",
                        type=str,
                        default="A slow camera pan over a sunlit landscape.")
    parser.add_argument(
        "--negative_prompt",
        type=str,
        default=(
            "The video captures a series of frames showing ugly scenes, static with no motion, motion blur, "
            "over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, "
            "underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, "
            "jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special "
            "effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and "
            "flickering. Overall, the video is of poor quality."
        ),
    )
    parser.add_argument("--trajectory",
                        type=str,
                        default="left",
                        choices=[
                            "left", "right", "up", "down", "zoom_in",
                            "zoom_out", "clockwise", "counterclockwise", "none"
                        ])
    parser.add_argument("--movement_distance", type=float, default=0.3)
    parser.add_argument("--camera_rotation",
                        type=str,
                        default="center_facing",
                        choices=[
                            "center_facing", "no_rotation",
                            "trajectory_aligned"
                        ])
    parser.add_argument("--height", type=int, default=704)
    parser.add_argument("--width", type=int, default=1280)
    parser.add_argument("--num_frames", type=int, default=121)
    parser.add_argument("--num_inference_steps", type=int, default=35)
    parser.add_argument("--guidance_scale", type=float, default=1.0)
    parser.add_argument("--output_path",
                        type=str,
                        default="outputs_video/gen3c.mp4")
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    generator = VideoGenerator.from_pretrained(
        args.model_path,
        num_gpus=1,
        use_fsdp_inference=False,
        dit_cpu_offload=False,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,
    )

    try:
        # The return value is not used; the video is written to output_path.
        generator.generate_video(
            args.prompt,
            negative_prompt=args.negative_prompt,
            image_path=args.image_path,
            trajectory_type=args.trajectory,
            movement_distance=args.movement_distance,
            camera_rotation=args.camera_rotation,
            height=args.height,
            width=args.width,
            num_frames=args.num_frames,
            num_inference_steps=args.num_inference_steps,
            guidance_scale=args.guidance_scale,
            fps=24,
            seed=args.seed,
            output_path=args.output_path,
            save_video=True,
        )
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()
basic_hy15.py
from fastvideo import VideoGenerator
import json
# from fastvideo.api.sampling_param import SamplingParam
OUTPUT_PATH = "video_samples_hy15"
def main():
    """Generate two HunyuanVideo-1.5 samples and save them under OUTPUT_PATH.

    Both calls use the same settings: empty negative prompt, 81 frames, 16 fps.
    """
    # FastVideo will automatically use the optimal default arguments for the
    # model.
    # If a local path is provided, FastVideo will make a best effort
    # attempt to identify the optimal arguments.
    generator = VideoGenerator.from_pretrained(
        "hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v",
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,  # set to false if low CPU RAM or hit obscure "CUDA error: Invalid argument"
        # image_encoder_cpu_offload=False,
    )

    # Keyword arguments shared by both generation calls.
    shared_options = {
        "output_path": OUTPUT_PATH,
        "save_video": True,
        "negative_prompt": "",
        "num_frames": 81,
        "fps": 16,
    }

    raccoon_prompt = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
    )
    first_video = generator.generate_video(raccoon_prompt, **shared_options)

    lion_prompt = (
        "A majestic lion strides across the golden savanna, its powerful frame "
        "glistening under the warm afternoon sun. The tall grass ripples gently in "
        "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
        "embodying the raw energy of the wild. Low angle, steady tracking shot, "
        "cinematic."
    )
    second_video = generator.generate_video(lion_prompt, **shared_options)


if __name__ == "__main__":
    main()
basic_hy15_1080p.py
from fastvideo import VideoGenerator
import json
# from fastvideo.api.sampling_param import SamplingParam
OUTPUT_PATH = "video_samples_hy15_1080p"
def main():
    """Generate two samples with the HunyuanVideo-1.5 1080p checkpoint.

    Both videos are saved under OUTPUT_PATH with an empty negative prompt.
    """
    # FastVideo will automatically use the optimal default arguments for the
    # model.
    # If a local path is provided, FastVideo will make a best effort
    # attempt to identify the optimal arguments.
    generator = VideoGenerator.from_pretrained(
        "weizhou03/HunyuanVideo-1.5-Diffusers-1080p-2SR",  # 480p -> 720p -> 1080p
        # or "weizhou03/HunyuanVideo-1.5-Diffusers-1080p" # 720p -> 1080p
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,  # set to false if low CPU RAM or hit obscure "CUDA error: Invalid argument"
        # image_encoder_cpu_offload=False,
    )

    # Keyword arguments shared by both generation calls.
    shared_options = {
        "output_path": OUTPUT_PATH,
        "save_video": True,
        "negative_prompt": "",
    }

    raccoon_prompt = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
    )
    first_video = generator.generate_video(raccoon_prompt, **shared_options)

    lion_prompt = (
        "A majestic lion strides across the golden savanna, its powerful frame "
        "glistening under the warm afternoon sun. The tall grass ripples gently in "
        "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
        "embodying the raw energy of the wild. Low angle, steady tracking shot, "
        "cinematic."
    )
    second_video = generator.generate_video(lion_prompt, **shared_options)


if __name__ == "__main__":
    main()
basic_hyworld.py
from fastvideo import VideoGenerator
from fastvideo.models.dits.hyworld.resolution_utils import get_resolution_from_image
# Default prompt from HY-WorldPlay run.sh
DEFAULT_PROMPT = 'A paved pathway leads towards a stone arch bridge spanning a calm body of water. Lush green trees and foliage line the path and the far bank of the water. A traditional-style pavilion with a tiered, reddish-brown roof sits on the far shore. The water reflects the surrounding greenery and the sky. The scene is bathed in soft, natural light, creating a tranquil and serene atmosphere. The pathway is composed of large, rectangular stones, and the bridge is constructed of light gray stone. The overall composition emphasizes the peaceful and harmonious nature of the landscape.'
DEFAULT_IMAGE = 'https://raw.githubusercontent.com/Tencent-Hunyuan/HY-WorldPlay/main/assets/img/test.png'
OUTPUT_PATH = "video_samples_hyworld"
def main():
    """Parse CLI options and generate one HYWorld video from an image and pose.

    The output resolution is computed from the input image via
    ``get_resolution_from_image``. Both hyphenated and underscored spellings
    of the multi-word flags are accepted. The generator is shut down in a
    ``finally`` block so it is released even when generation raises.
    """
    import argparse

    # pose: (a, w, s, d) - (15, 31)
    # num_frames: (61, 125)
    parser = argparse.ArgumentParser(description="HYWorld video generation with FastVideo")
    parser.add_argument("--prompt", type=str, default=DEFAULT_PROMPT, help="Text prompt for video generation")
    parser.add_argument("--image", type=str, default=DEFAULT_IMAGE, help="Path or URL to input image")
    parser.add_argument("--pose", type=str, default='w-31', help="Pose string (e.g., 'a-31', 'w-31', 's-31', 'd-31')")
    # The original flags mixed "_" and "-"; the first spelling is the original
    # one and still determines the attribute name, the second is an alias.
    parser.add_argument("--output_path", "--output-path", type=str, default=OUTPUT_PATH, help="Output video path")
    parser.add_argument("--num-frames", "--num_frames", type=int, default=125, help="Number of frames")
    parser.add_argument("--seed", type=int, default=1, help="Random seed")
    parser.add_argument("--resolution", type=str, default="480p", help="Only support 480p for now")
    args = parser.parse_args()

    # Automatically determine resolution from input image
    height, width = get_resolution_from_image(args.image, args.resolution)

    print(f"Image: {args.image}")
    print(f"Pose: {args.pose}")
    print(f"Resolution: {height}x{width} (from {args.resolution} buckets)")
    print(f"Num frames: {args.num_frames}")
    print(f"Output path: {args.output_path}")

    # Initialize generator
    print("\nInitializing VideoGenerator for HYWorld...")
    generator = VideoGenerator.from_pretrained(
        "FastVideo/HY-WorldPlay-Bidirectional-Diffusers",
        num_gpus=1,
        use_fsdp_inference=True,
        dit_cpu_offload=True,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,
        image_encoder_cpu_offload=True,
    )

    try:
        # Generate video
        # The pose string is automatically converted to camera matrices by the pipeline
        print("\nGenerating video...")
        generator.generate_video(
            prompt=args.prompt,
            image_path=args.image,
            pose=args.pose,  # Camera trajectory control
            output_path=args.output_path,
            save_video=True,
            negative_prompt="",
            num_frames=args.num_frames,
            fps=24,
            height=height,
            width=width,
            seed=args.seed,
        )
        print(f"\nVideo saved to: {args.output_path}")
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()
basic_lingbotworld_base_cam.py
from fastvideo import VideoGenerator
from fastvideo.models.dits.lingbotworld.cam_utils import prepare_camera_embedding
# from fastvideo.api.sampling_param import SamplingParam
OUTPUT_PATH = "video_samples_lingbotworld"
def main():
    """Generate a camera-controlled LingBot-World video from one image.

    Loads ``FastVideo/LingBot-World-Base-Cam-Diffusers`` on a single GPU,
    builds the camera embedding with ``prepare_camera_embedding`` from the
    example action directory, and renders a 480x832 clip into
    ``OUTPUT_PATH``.
    """
    frame_height, frame_width = 480, 832

    # FastVideo picks default arguments for the model; the offload and
    # memory flags below are the ones this example overrides explicitly.
    load_options = {
        # FastVideo will automatically handle distributed setup
        "num_gpus": 1,
        "use_fsdp_inference": False,  # set to True if GPU is out of memory
        "dit_cpu_offload": True,  # DiT need to be offloaded for MoE
        "vae_cpu_offload": False,
        "text_encoder_cpu_offload": True,
        # Set pin_cpu_memory to false if CPU RAM is limited and there are
        # no frequent CPU-GPU transfers
        "pin_cpu_memory": True,
        # "image_encoder_cpu_offload": False,
    }
    generator = VideoGenerator.from_pretrained(
        "FastVideo/LingBot-World-Base-Cam-Diffusers",
        **load_options,
    )

    prompt = (
        "The video presents a soaring journey through a fantasy jungle. "
        "The wind whips past the rider's blue hands gripping the reins, "
        "causing the leather straps to vibrate. "
        "The ancient gothic castle approaches steadily, its stone details "
        "becoming clearer against the backdrop of floating islands and "
        "distant waterfalls."
    )
    image_path = "https://raw.githubusercontent.com/Robbyant/lingbot-world/main/examples/00/image.jpg"
    action_path = "examples/inference/basic/lingbotworld_examples/00"

    # The helper returns the embedding together with a frame count; the
    # returned count (not the requested 81) is what gets passed on below.
    camera_emb, frame_count = prepare_camera_embedding(
        action_path=action_path,
        num_frames=81,
        height=frame_height,
        width=frame_width,
        spatial_scale=8,
    )

    generator.generate_video(
        prompt,
        image_path=image_path,
        output_path=OUTPUT_PATH,
        save_video=True,
        num_frames=frame_count,
        height=frame_height,
        width=frame_width,
        c2ws_plucker_emb=camera_emb,
    )
if __name__ == "__main__":
main()
basic_longcat_i2v.py
"""
LongCat Image-to-Video (I2V) Example Script
This script demonstrates LongCat I2V inference using the FastVideo Python API.
LongCat I2V takes an input image and generates a video from it.
It runs both basic generation (50 steps) and distill+refine generation
(16 steps distill + 50 steps refinement to 720p with BSA).
Usage:
python examples/inference/basic/basic_longcat_i2v.py
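Note:
Refinement is described here as using 768x768 dimensions, where the latent
(48x48) is divisible by 8 and therefore compatible with BSA chunks [4, 4, 8].
The refinement call in this script currently passes 720x720 with BSA chunks
[4, 4, 4]; confirm which configuration is intended.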
"""
import glob
import os
from fastvideo import VideoGenerator
# Common prompts and settings matching the shell script examples
PROMPT = (
"A woman sits at a wooden table by the window in a cozy café. She reaches out "
"with her right hand, picks up the white coffee cup from the saucer, and gently "
"brings it to her lips to take a sip. After drinking, she places the cup back on "
"the table and looks out the window, enjoying the peaceful atmosphere."
)
NEGATIVE_PROMPT = (
"Bright tones, overexposed, static, blurred details, subtitles, style, works, "
"paintings, images, static, overall gray, worst quality, low quality, JPEG compression "
"residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, "
"deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, "
"three legs, many people in the background, walking backwards"
)
# Input image path
IMAGE_PATH = "assets/girl.png"
SEED = 42
def basic_generation():
    """Render one LongCat I2V clip with the 50-step schedule.

    Loads ``FastVideo/LongCat-Video-I2V-Diffusers`` on one GPU with BSA
    disabled, generates a 480x480, 93-frame video conditioned on
    ``IMAGE_PATH``, then shuts the generator down.
    """
    rule = "=" * 60
    print(rule)
    print("LongCat I2V: Basic Generation (50 steps, 480p)")
    print(rule)

    output_path = "outputs_video/longcat_i2v_basic"

    load_options = {
        "num_gpus": 1,
        "use_fsdp_inference": False,  # set to True if GPU is out of memory
        "dit_cpu_offload": False,
        "vae_cpu_offload": True,
        "text_encoder_cpu_offload": True,
        "pin_cpu_memory": False,
        "enable_bsa": False,
    }
    generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-I2V-Diffusers",
        **load_options,
    )

    request = {
        "prompt": PROMPT,
        "negative_prompt": NEGATIVE_PROMPT,
        "image_path": IMAGE_PATH,
        "output_path": output_path,
        "save_video": True,
        "height": 480,
        "width": 480,  # Square
        "num_frames": 93,
        "num_inference_steps": 50,
        "fps": 15,
        "guidance_scale": 4.0,
        "seed": SEED,
    }
    generator.generate_video(**request)

    print(f"\nBasic generation complete! Video saved to: {output_path}")
    generator.shutdown()
def distill_refine_generation():
    """Run the two-stage LongCat I2V pipeline: distilled draft, then refine.

    Stage 1 loads the I2V model with the distilled LoRA and renders a
    16-step, 480x480 clip. Stage 2 picks the most recently modified
    ``.mp4`` in the stage-1 output directory and passes it as
    ``refine_from`` to a second generator (T2V model, refinement LoRA, BSA
    enabled).

    Raises:
        FileNotFoundError: if stage 1 left no ``.mp4`` in its output
            directory.
    """
    rule = "=" * 60
    thin_rule = "-" * 40

    print("\n" + rule)
    print("LongCat I2V: Distill + Refine Pipeline")
    print(rule)

    # ---- Stage 1: distilled generation (16 steps at 480p) ----
    print("\n[Stage 1] Distilled generation (16 steps, 480p)")
    print(thin_rule)

    distill_output_path = "outputs_video/longcat_i2v_distill"
    draft_generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-I2V-Diffusers",
        **{
            "num_gpus": 1,
            "use_fsdp_inference": True,
            "dit_cpu_offload": False,
            "vae_cpu_offload": True,
            "text_encoder_cpu_offload": True,
            "pin_cpu_memory": False,
            "enable_bsa": False,
            "lora_path": "FastVideo/LongCat-Video-T2V-Distilled-LoRA",
            "lora_nickname": "distilled",
        },
    )
    draft_request = {
        "prompt": PROMPT,
        "negative_prompt": NEGATIVE_PROMPT,
        "image_path": IMAGE_PATH,
        "output_path": distill_output_path,
        "save_video": True,
        "height": 480,
        "width": 480,  # Square
        "num_frames": 93,
        "num_inference_steps": 16,
        "fps": 15,
        "guidance_scale": 1.0,
        "seed": SEED,
    }
    draft_generator.generate_video(**draft_request)
    print(f"Distilled generation complete! Video saved to: {distill_output_path}")
    draft_generator.shutdown()

    # ---- Stage 2: refinement ----
    print("\n[Stage 2] Refinement (480p -> 768p with BSA)")
    print(thin_rule)

    # Locate the file stage 1 actually wrote; the newest one by
    # modification time wins if the directory holds several.
    candidates = glob.glob(os.path.join(distill_output_path, "*.mp4"))
    if not candidates:
        raise FileNotFoundError(f"No video file found in {distill_output_path}")
    distill_video_path = max(candidates, key=os.path.getmtime)
    print(f"Using stage 1 video: {distill_video_path}")

    # Refinement runs on the T2V checkpoint (not I2V): it consumes the
    # already-generated video rather than the conditioning image.
    # NOTE(review): the messages above and the module notes describe a
    # 768x768 refinement with BSA chunks [4, 4, 8] (latent 48x48, 48 % 8 == 0),
    # but this call passes BSA chunks [4, 4, 4] and 720x720. Confirm which
    # configuration is intended before changing either side.
    refine_generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-T2V-Diffusers",
        **{
            "num_gpus": 1,
            "use_fsdp_inference": True,
            "dit_cpu_offload": True,
            "vae_cpu_offload": True,
            "text_encoder_cpu_offload": True,
            "pin_cpu_memory": False,
            "enable_bsa": True,
            "bsa_sparsity": 0.875,
            "bsa_chunk_q": [4, 4, 4],
            "bsa_chunk_k": [4, 4, 4],
            "lora_path": "FastVideo/LongCat-Video-T2V-Refinement-LoRA",
            "lora_nickname": "refinement",
        },
    )

    refine_output_path = "outputs_video/longcat_i2v_refine_720p"
    refine_request = {
        "prompt": PROMPT,
        "negative_prompt": NEGATIVE_PROMPT,
        "output_path": refine_output_path,
        "save_video": True,
        "refine_from": distill_video_path,
        "t_thresh": 0.5,
        "spatial_refine_only": False,
        "num_cond_frames": 0,
        "height": 720,
        "width": 720,
        "num_inference_steps": 50,
        "fps": 30,
        "guidance_scale": 1.0,
        "seed": SEED,
    }
    refine_generator.generate_video(**refine_request)
    print(f"Refinement complete! Video saved to: {refine_output_path}")
    refine_generator.shutdown()
def main():
    """Run the basic I2V example, then the distill+refine example."""
    rule = "=" * 60

    print(f"\n{rule}")
    print("LongCat Image-to-Video Example")
    print(f"{rule}\n")

    # Each stage creates and shuts down its own generator(s).
    for stage in (basic_generation, distill_refine_generation):
        stage()

    print(f"\n{rule}")
    print("All generations complete!")
    print(rule)
if __name__ == "__main__":
main()
basic_longcat_t2v.py
"""
LongCat Text-to-Video (T2V) Example Script
This script demonstrates LongCat T2V inference using the FastVideo Python API.
It runs both basic generation (50 steps) and distill+refine generation
(16 steps distill + 50 steps refinement to 720p).
Usage:
python examples/inference/basic/basic_longcat_t2v.py
"""
import glob
import os
from fastvideo import VideoGenerator
# Common prompts and settings matching the shell script examples
PROMPT = (
"In a realistic photography style, a white boy around seven or eight years old "
"sits on a park bench, wearing a light blue T-shirt, denim shorts, and white sneakers. "
"He holds an ice cream cone with vanilla and chocolate flavors, and beside him is a "
"medium-sized golden Labrador. Smiling, the boy offers the ice cream to the dog, "
"who eagerly licks it with its tongue. The sun is shining brightly, and the background "
"features a green lawn and several tall trees, creating a warm and loving scene."
)
NEGATIVE_PROMPT = (
"Bright tones, overexposed, static, blurred details, subtitles, style, works, "
"paintings, images, static, overall gray, worst quality, low quality, JPEG compression "
"residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, "
"deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, "
"three legs, many people in the background, walking backwards"
)
SEED = 42
def basic_generation():
    """Render one LongCat T2V clip with the 50-step schedule.

    Loads ``FastVideo/LongCat-Video-T2V-Diffusers`` on one GPU with BSA
    disabled, generates a 480x832, 93-frame video from ``PROMPT``, then
    shuts the generator down.
    """
    rule = "=" * 60
    print(rule)
    print("LongCat T2V: Basic Generation (50 steps, 480p)")
    print(rule)

    output_path = "outputs_video/longcat_t2v_basic"

    load_options = {
        "num_gpus": 1,
        "use_fsdp_inference": False,  # set to True if GPU is out of memory
        "dit_cpu_offload": False,
        "vae_cpu_offload": True,
        "text_encoder_cpu_offload": True,
        "pin_cpu_memory": False,
        "enable_bsa": False,
    }
    generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-T2V-Diffusers",
        **load_options,
    )

    request = {
        "prompt": PROMPT,
        "negative_prompt": NEGATIVE_PROMPT,
        "output_path": output_path,
        "save_video": True,
        "height": 480,
        "width": 832,
        "num_frames": 93,
        "num_inference_steps": 50,
        "fps": 15,
        "guidance_scale": 4.0,
        "seed": SEED,
    }
    generator.generate_video(**request)

    print(f"\nBasic generation complete! Video saved to: {output_path}")
    generator.shutdown()
def distill_refine_generation():
    """Run the two-stage LongCat T2V pipeline: distilled draft, then refine.

    Stage 1 loads the T2V model with the distilled LoRA and renders a
    16-step, 480x832 clip. Stage 2 picks the most recently modified
    ``.mp4`` in the stage-1 output directory and passes it as
    ``refine_from`` to a second generator that uses the refinement LoRA
    with BSA enabled, targeting 720x1280.

    Raises:
        FileNotFoundError: if stage 1 left no ``.mp4`` in its output
            directory.
    """
    rule = "=" * 60
    thin_rule = "-" * 40

    print("\n" + rule)
    print("LongCat T2V: Distill + Refine Pipeline")
    print(rule)

    # ---- Stage 1: distilled generation (16 steps at 480p) ----
    print("\n[Stage 1] Distilled generation (16 steps, 480p)")
    print(thin_rule)

    distill_output_path = "outputs_video/longcat_t2v_distill"
    draft_generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-T2V-Diffusers",
        **{
            "num_gpus": 1,
            "use_fsdp_inference": True,
            "dit_cpu_offload": False,
            "vae_cpu_offload": True,
            "text_encoder_cpu_offload": True,
            "pin_cpu_memory": False,
            "enable_bsa": False,
            "lora_path": "FastVideo/LongCat-Video-T2V-Distilled-LoRA",
            "lora_nickname": "distilled",
        },
    )
    draft_request = {
        "prompt": PROMPT,
        "negative_prompt": NEGATIVE_PROMPT,
        "output_path": distill_output_path,
        "save_video": True,
        "height": 480,
        "width": 832,
        "num_frames": 93,
        "num_inference_steps": 16,
        "fps": 15,
        "guidance_scale": 1.0,
        "seed": SEED,
    }
    draft_generator.generate_video(**draft_request)
    print(f"Distilled generation complete! Video saved to: {distill_output_path}")
    draft_generator.shutdown()

    # ---- Stage 2: refinement (480p -> 720p) ----
    print("\n[Stage 2] Refinement (480p -> 720p with BSA)")
    print(thin_rule)

    # Locate the file stage 1 actually wrote; the newest one by
    # modification time wins if the directory holds several.
    candidates = glob.glob(os.path.join(distill_output_path, "*.mp4"))
    if not candidates:
        raise FileNotFoundError(f"No video file found in {distill_output_path}")
    distill_video_path = max(candidates, key=os.path.getmtime)
    print(f"Using stage 1 video: {distill_video_path}")

    # A fresh generator is needed because the refinement stage uses a
    # different LoRA and turns BSA on.
    refine_generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-T2V-Diffusers",
        **{
            "num_gpus": 1,
            "use_fsdp_inference": True,
            "dit_cpu_offload": True,
            "vae_cpu_offload": True,
            "text_encoder_cpu_offload": True,
            "pin_cpu_memory": False,
            "enable_bsa": True,
            "bsa_sparsity": 0.875,
            "bsa_chunk_q": [4, 4, 8],
            "bsa_chunk_k": [4, 4, 8],
            "lora_path": "FastVideo/LongCat-Video-T2V-Refinement-LoRA",
            "lora_nickname": "refinement",
        },
    )

    refine_output_path = "outputs_video/longcat_t2v_refine_720p"
    refine_request = {
        "prompt": PROMPT,
        "negative_prompt": NEGATIVE_PROMPT,
        "output_path": refine_output_path,
        "save_video": True,
        "refine_from": distill_video_path,
        "t_thresh": 0.5,
        "spatial_refine_only": False,
        "num_cond_frames": 0,
        "height": 720,
        "width": 1280,
        "num_inference_steps": 50,
        "fps": 30,
        "guidance_scale": 1.0,
        "seed": SEED,
    }
    refine_generator.generate_video(**refine_request)
    print(f"Refinement complete! Video saved to: {refine_output_path}")
    refine_generator.shutdown()
def main():
    """Run the basic T2V example, then the distill+refine example."""
    rule = "=" * 60

    print(f"\n{rule}")
    print("LongCat Text-to-Video Example")
    print(f"{rule}\n")

    # Each stage creates and shuts down its own generator(s).
    for stage in (basic_generation, distill_refine_generation):
        stage()

    print(f"\n{rule}")
    print("All generations complete!")
    print(rule)
if __name__ == "__main__":
main()
basic_longcat_vc.py
"""
LongCat Video Continuation (VC) Example Script
This script demonstrates LongCat VC inference using the FastVideo Python API.
LongCat VC takes an input video and generates a continuation of it.
It runs both basic generation (50 steps) and distill+refine generation
(16 steps distill + 50 steps refinement to 720p).
Usage:
python examples/inference/basic/basic_longcat_vc.py
Prerequisites:
- Ensure the input video exists at assets/motorcycle.mp4
(or provide your own video)
"""
import glob
import os
from fastvideo import VideoGenerator
# Common prompts and settings matching the shell script examples
PROMPT = (
"A person rides a motorcycle along a long, straight road that stretches between "
"a body of water and a forested hillside. The rider steadily accelerates, keeping "
"the motorcycle centered between the guardrails, while the scenery passes by on "
"both sides. The video captures the journey from the rider's perspective, emphasizing "
"the sense of motion and adventure."
)
NEGATIVE_PROMPT = (
"Bright tones, overexposed, static, blurred details, subtitles, style, works, "
"paintings, images, static, overall gray, worst quality, low quality, JPEG compression "
"residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, "
"deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, "
"three legs, many people in the background, walking backwards"
)
# Input video path
VIDEO_PATH = "assets/motorcycle.mp4"
# Number of conditioning frames from the input video
NUM_COND_FRAMES = 13
SEED = 42
def basic_generation():
    """Render one LongCat video-continuation clip with the 50-step schedule.

    Loads ``FastVideo/LongCat-Video-VC-Diffusers`` on one GPU with BSA
    disabled and continues ``VIDEO_PATH`` using ``NUM_COND_FRAMES``
    conditioning frames, producing a 480x832, 93-frame video.

    Raises:
        FileNotFoundError: if ``VIDEO_PATH`` does not exist.
    """
    rule = "=" * 60
    print(rule)
    print("LongCat VC: Basic Generation (50 steps, 480p)")
    print(rule)

    # Fail before loading the model if the input video is missing.
    if not os.path.exists(VIDEO_PATH):
        raise FileNotFoundError(
            f"Video not found at {VIDEO_PATH}. "
            "Please provide a valid video path."
        )

    output_path = "outputs_video/longcat_vc_basic"

    load_options = {
        "num_gpus": 1,
        "use_fsdp_inference": False,  # set to True if GPU is out of memory
        "dit_cpu_offload": False,
        "vae_cpu_offload": True,
        "text_encoder_cpu_offload": True,
        "pin_cpu_memory": False,
        "enable_bsa": False,
    }
    generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-VC-Diffusers",
        **load_options,
    )

    request = {
        "prompt": PROMPT,
        "negative_prompt": NEGATIVE_PROMPT,
        "video_path": VIDEO_PATH,
        "num_cond_frames": NUM_COND_FRAMES,
        "output_path": output_path,
        "save_video": True,
        "height": 480,
        "width": 832,
        "num_frames": 93,
        "num_inference_steps": 50,
        "fps": 15,
        "guidance_scale": 4.0,
        "seed": SEED,
    }
    generator.generate_video(**request)

    print(f"\nBasic generation complete! Video saved to: {output_path}")
    generator.shutdown()
def distill_refine_generation():
    """Run the two-stage LongCat VC pipeline: distilled draft, then refine.

    Stage 1 loads the VC model with the distilled LoRA and renders a
    16-step, 480x832 continuation of ``VIDEO_PATH``. Stage 2 picks the most
    recently modified ``.mp4`` in the stage-1 output directory and passes
    it as ``refine_from`` to a second generator (T2V model, refinement
    LoRA, BSA enabled), targeting 720x1280.

    Raises:
        FileNotFoundError: if ``VIDEO_PATH`` does not exist, or if stage 1
            left no ``.mp4`` in its output directory.
    """
    rule = "=" * 60
    thin_rule = "-" * 40

    print("\n" + rule)
    print("LongCat VC: Distill + Refine Pipeline")
    print(rule)

    # Fail before loading any model if the input video is missing.
    if not os.path.exists(VIDEO_PATH):
        raise FileNotFoundError(
            f"Video not found at {VIDEO_PATH}. "
            "Please provide a valid video path."
        )

    # ---- Stage 1: distilled generation (16 steps at 480p) ----
    print("\n[Stage 1] Distilled generation (16 steps, 480p)")
    print(thin_rule)

    distill_output_path = "outputs_video/longcat_vc_distill"
    draft_generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-VC-Diffusers",
        **{
            "num_gpus": 1,
            "use_fsdp_inference": True,
            "dit_cpu_offload": False,
            "vae_cpu_offload": True,
            "text_encoder_cpu_offload": True,
            "pin_cpu_memory": False,
            "enable_bsa": False,
            "lora_path": "FastVideo/LongCat-Video-T2V-Distilled-LoRA",
            "lora_nickname": "distilled",
        },
    )
    draft_request = {
        "prompt": PROMPT,
        "negative_prompt": NEGATIVE_PROMPT,
        "video_path": VIDEO_PATH,
        "num_cond_frames": NUM_COND_FRAMES,
        "output_path": distill_output_path,
        "save_video": True,
        "height": 480,
        "width": 832,
        "num_frames": 93,
        "num_inference_steps": 16,
        "fps": 15,
        "guidance_scale": 1.0,
        "seed": SEED,
    }
    draft_generator.generate_video(**draft_request)
    print(f"Distilled generation complete! Video saved to: {distill_output_path}")
    draft_generator.shutdown()

    # ---- Stage 2: refinement (480p -> 720p) ----
    print("\n[Stage 2] Refinement (480p -> 720p with BSA)")
    print(thin_rule)

    # Locate the file stage 1 actually wrote; the newest one by
    # modification time wins if the directory holds several.
    candidates = glob.glob(os.path.join(distill_output_path, "*.mp4"))
    if not candidates:
        raise FileNotFoundError(f"No video file found in {distill_output_path}")
    distill_video_path = max(candidates, key=os.path.getmtime)
    print(f"Using stage 1 video: {distill_video_path}")

    # Refinement runs on the T2V checkpoint (not VC): it upscales the
    # already-generated video instead of continuing the input one.
    refine_generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-T2V-Diffusers",
        **{
            "num_gpus": 1,
            "use_fsdp_inference": True,
            "dit_cpu_offload": True,
            "vae_cpu_offload": True,
            "text_encoder_cpu_offload": True,
            "pin_cpu_memory": False,
            "enable_bsa": True,
            "bsa_sparsity": 0.875,
            "bsa_chunk_q": [4, 4, 8],
            "bsa_chunk_k": [4, 4, 8],
            "lora_path": "FastVideo/LongCat-Video-T2V-Refinement-LoRA",
            "lora_nickname": "refinement",
        },
    )

    refine_output_path = "outputs_video/longcat_vc_refine_720p"
    refine_request = {
        "prompt": PROMPT,
        "negative_prompt": NEGATIVE_PROMPT,
        "output_path": refine_output_path,
        "save_video": True,
        "refine_from": distill_video_path,
        "t_thresh": 0.5,
        "spatial_refine_only": False,
        "num_cond_frames": 0,  # For refinement, no conditioning frames
        "height": 720,
        "width": 1280,
        "num_inference_steps": 50,
        "fps": 30,
        "guidance_scale": 1.0,
        "seed": SEED,
    }
    refine_generator.generate_video(**refine_request)
    print(f"Refinement complete! Video saved to: {refine_output_path}")
    refine_generator.shutdown()
def main():
    """Run the basic VC example, then the distill+refine example."""
    rule = "=" * 60

    print(f"\n{rule}")
    print("LongCat Video Continuation Example")
    print(f"{rule}\n")

    # Each stage creates and shuts down its own generator(s).
    for stage in (basic_generation, distill_refine_generation):
        stage()

    print(f"\n{rule}")
    print("All generations complete!")
    print(rule)
if __name__ == "__main__":
main()
basic_ltx2.py
from fastvideo import VideoGenerator
PROMPT = (
"A warm sunny backyard. The camera starts in a tight cinematic close-up "
"of a woman and a man in their 30s, facing each other with serious "
"expressions. The woman, emotional and dramatic, says softly, \"That's "
"it... Dad's lost it. And we've lost Dad.\" The man exhales, slightly "
"annoyed: \"Stop being so dramatic, Jess.\" A beat. He glances aside, "
"then mutters defensively, \"He's just having fun.\" The camera slowly "
"pans right, revealing the grandfather in the garden wearing enormous "
"butterfly wings, waving his arms in the air like he's trying to take "
"off. He shouts, \"Wheeeew!\" as he flaps his wings with full commitment. "
"The woman covers her face, on the verge of tears. The tone is deadpan, "
"absurd, and quietly tragic."
)
def main() -> None:
    """Generate one 1088x1920, 121-frame LTX2 clip from ``PROMPT``.

    Only the model id and GPU count are passed at load time, so sampling
    otherwise relies on FastVideo's defaults for the LTX2 base model. The
    generator is shut down after the video is written.
    """
    output_path = "outputs_video/ltx2_basic/output_ltx2_base_t2v_1088_1920_1.1.mp4"

    generator = VideoGenerator.from_pretrained("Davids048/LTX2-Base-Diffusers", num_gpus=1)

    request = {
        "prompt": PROMPT,
        "output_path": output_path,
        "save_video": True,
        "num_frames": 121,
        "height": 1088,
        "width": 1920,
    }
    generator.generate_video(**request)
    generator.shutdown()
if __name__ == "__main__":
main()
basic_ltx2_3_distilled_i2v.py
# SPDX-License-Identifier: Apache-2.0
"""LTX-2.3 distilled image-to-video with torch.compile + timing breakdown.
This example runs the LTX-2.3 distilled student model on a single GPU with
torch.compile fully enabled, then prints a per-stage timing breakdown so the
user can see where wall-time goes. It is meant as the canonical entry point
for trying out the LTX-2.3 i2v path on `hao-ai-lab/FastVideo:main`.
Quick start
-----------
export LTX23_I2V_IMAGE=/path/to/your/portrait_or_product.jpg
# optional overrides:
# export LTX23_I2V_PROMPT="a fashion model walks toward camera..."
# export LTX23_OUTPUT_DIR=outputs_video/ltx2_3_distilled_i2v
python examples/inference/basic/basic_ltx2_3_distilled_i2v.py
What the script does
--------------------
1. Loads FastVideo/LTX-2.3-Distilled-Diffusers (8 denoise + 3 refine steps,
CFG=1, no refine LoRA — the distilled production recipe).
2. Compiles the DiT, text encoder, and VAE (fullgraph, Inductor default
mode — autotune adds ~7 min cold-compile here with no measurable
e2e gain).
3. Runs 2 warmup calls (untimed) + 2 measured calls. Two warmups are kept
as a safety net — the first call pays cold compile + first-shape guard
work, and a second warmup ensures any residual recompiles settle before
we measure.
4. Prints a per-stage breakdown and an average over the measured runs.
Hardware notes
--------------
- Single-GPU example; for multi-GPU sequence-parallel see the gradio demo
under `examples/inference/gradio/local/gradio_local_demo_ltx2_3/`.
- First-time compile takes ~30-40 min on GB200 (~20 min on H100; cached
in `$TORCHINDUCTOR_CACHE_DIR` afterwards). Subsequent invocations only
pay the one-time process load + a few seconds of dynamo trace.
- On GB200 / Blackwell, run with `env -u LD_LIBRARY_PATH ...` to avoid a
system-cuBLAS / torch-cuBLAS mismatch that fails every GEMM. The
`_inductor.shape_padding = False` line below also avoids a pad_mm
landmine on the same generation of cards.
"""
from __future__ import annotations
import os
import time
from collections import OrderedDict
from pathlib import Path
import torch._inductor.config as _inductor
from fastvideo import VideoGenerator
from fastvideo.configs.pipelines.base import PipelineConfig
from fastvideo.utils import maybe_download_model
# Env knobs (set BEFORE importing fastvideo where possible — but
# FASTVIDEO_ATTENTION_BACKEND is fine here because the worker reads it
# on generator construction).
os.environ.setdefault("FASTVIDEO_ATTENTION_BACKEND", "FLASH_ATTN")
os.environ.setdefault("FASTVIDEO_STAGE_LOGGING", "1")
# Inductor knobs. The first one (shape_padding=False) is mandatory on
# Blackwell to avoid a cuBLAS INVALID_VALUE crash inside pad_mm during
# the refine path. The rest are autotune-friendliness flags.
_inductor.shape_padding = False
_inductor.conv_1x1_as_mm = True
_inductor.coordinate_descent_tuning = True
_inductor.coordinate_descent_check_all_directions = True
_inductor.epilogue_fusion = False
MODEL_ID = os.path.expandvars(
os.path.expanduser(
os.getenv("LTX23_MODEL_PATH", "FastVideo/LTX-2.3-Distilled-Diffusers")
)
)
OUTPUT_DIR = Path(
os.getenv("LTX23_OUTPUT_DIR", "outputs_video/ltx2_3_distilled_i2v")
)
I2V_IMAGE = os.getenv("LTX23_I2V_IMAGE", "")
DEFAULT_PROMPT = (
"A fashion model takes a slow step forward and shifts her weight, "
"the soft fabric of her clothing swaying and rippling with the "
"motion, her hair shifting gently, soft even studio lighting on a "
"clean light background, elegant slow-motion runway feel."
)
PROMPT = os.getenv("LTX23_I2V_PROMPT", DEFAULT_PROMPT)
# Per-stage timing helpers --------------------------------------------------
def _print_stage_breakdown(result: dict, label: str) -> float | None:
"""Print stage execution times and return the sum, or None if missing."""
logging_info = result.get("logging_info")
stages = getattr(logging_info, "stages", None) if logging_info else None
if not stages:
print(f" [{label}] stage breakdown unavailable")
return None
print(f" [{label}] stage breakdown:")
total = 0.0
for name, metrics in stages.items():
exec_s = float(metrics.get("execution_time", 0.0))
total += exec_s
print(f" - {name}: {exec_s:.3f}s")
print(f" - stage_sum: {total:.3f}s")
return total
def _collect_stage_times(
result: dict,
stage_times: dict[str, list[float]],
stage_order: OrderedDict[str, None],
) -> None:
logging_info = result.get("logging_info")
stages = getattr(logging_info, "stages", None) if logging_info else None
if not stages:
return
for name, metrics in stages.items():
stage_order.setdefault(name, None)
stage_times.setdefault(name, []).append(
float(metrics.get("execution_time", 0.0))
)
def _resolve_refine_upsampler(model_root: str) -> Path:
"""LTX-2.3 distilled snapshots ship a `spatial_upscaler/` subdir."""
for name in ("spatial_upscaler", "spatial_upsampler"):
cand = Path(model_root) / name
if (cand / "config.json").is_file():
return cand
raise FileNotFoundError(
f"No refine upsampler directory under {model_root}. "
f"Expected `{model_root}/spatial_upscaler/config.json`."
)
# Main ---------------------------------------------------------------------
def main() -> None:
    """Run the LTX-2.3 distilled i2v benchmark: warm up, measure, summarize.

    Validates ``I2V_IMAGE``, resolves the model and its refine upsampler,
    builds a generator with torch.compile enabled for the DiT, text encoder
    and VAE, then performs two warmup generations followed by two measured
    ones. Per-run and averaged stage timings are printed, and the generator
    is always shut down on exit.

    Raises:
        SystemExit: if ``I2V_IMAGE`` is empty or does not point to a file.
    """
    if not I2V_IMAGE:
        raise SystemExit(
            "LTX23_I2V_IMAGE is required for i2v. Example:\n"
            " export LTX23_I2V_IMAGE=/path/to/portrait_or_product.jpg\n"
            " python examples/inference/basic/basic_ltx2_3_distilled_i2v.py"
        )
    if not Path(I2V_IMAGE).is_file():
        raise SystemExit(f"LTX23_I2V_IMAGE not found: {I2V_IMAGE}")

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    model_root = maybe_download_model(MODEL_ID)
    upsampler_dir = _resolve_refine_upsampler(model_root)
    print(f"Model: {model_root}")
    print(f"Refine upsampler: {upsampler_dir}")
    print(f"i2v image: {I2V_IMAGE}")
    print(f"Output dir: {OUTPUT_DIR.resolve()}")

    # mode="default": per the original author's measurements, Inductor's
    # default schedule matches max-autotune on this pipeline while saving
    # roughly 7 minutes of cold compile on a single GB200.
    compile_options = dict(
        backend="inductor",
        fullgraph=True,
        mode="default",
        dynamic=False,
    )

    # Loading the pipeline config from the model path binds model-specific
    # tuning into the config; the original notes that the generic config
    # yields a substantially slower VAE decode stage.
    pipe_cfg = PipelineConfig.from_pretrained(model_root)
    pipe_cfg.dit_config.quant_config = None

    generator = VideoGenerator.from_pretrained(
        model_root,
        num_gpus=1,
        # Two-stage refine pipeline; the refine LoRA path is intentionally
        # empty for the distilled student.
        ltx2_refine_enabled=True,
        ltx2_refine_upsampler_path=str(upsampler_dir),
        ltx2_refine_lora_path="",
        ltx2_refine_num_inference_steps=3,
        ltx2_refine_guidance_scale=1.0,
        ltx2_refine_add_noise=True,
        pipeline_config=pipe_cfg,
        enable_torch_compile=True,
        enable_torch_compile_text_encoder=True,
        # The VAE is compiled as well; the same options dict is passed for
        # both the main and the VAE compile settings.
        enable_torch_compile_vae=True,
        torch_compile_kwargs=compile_options,
        torch_compile_kwargs_vae=compile_options,
        # Keep everything resident: no CPU offload for serving-style runs.
        dit_cpu_offload=False,
        text_encoder_cpu_offload=False,
        vae_cpu_offload=False,
        ltx2_vae_tiling=False,
    )

    # Arguments shared by every generate_video call below.
    common_kwargs = {
        "prompt": PROMPT,
        "negative_prompt": "",  # distilled is CFG-free; no negative needed
        "guidance_scale": 1.0,  # CFG=1 for distilled
        "height": 1280,
        "width": 832,  # portrait runway aspect
        "num_frames": 121,
        "fps": 24,  # ~5s clip
        "num_inference_steps": 8,  # distilled denoise steps
        # i2v: anchor the input image at frame 0 with full strength.
        "ltx2_images": [(I2V_IMAGE, 0, 1.0)],
        # 0.0 skips an extra JPEG re-encode of the conditioning image.
        "ltx2_image_crf": 0.0,
        "save_video": True,
    }

    warmup_runs = 2
    measured_runs = 2
    warmup_secs: list[float] = []
    measured_secs: list[float] = []
    stage_times: dict[str, list[float]] = {}
    stage_order: OrderedDict[str, None] = OrderedDict()

    try:
        # Warmup calls are excluded from the measurements, but their wall
        # time is still recorded so the first-compile cost is visible.
        for attempt in range(1, warmup_runs + 1):
            started = time.perf_counter()
            print(f"\n[warmup {attempt}/{warmup_runs}] compiling + generating…")
            generator.generate_video(
                output_path=str(OUTPUT_DIR / f"_warmup_{attempt}.mp4"),
                seed=7,
                **common_kwargs,
            )
            elapsed = time.perf_counter() - started
            warmup_secs.append(elapsed)
            print(f"[warmup {attempt}/{warmup_runs}] wall={elapsed:.1f}s")

        # Remove warmup artifacts so only measured outputs remain.
        for attempt in range(1, warmup_runs + 1):
            (OUTPUT_DIR / f"_warmup_{attempt}.mp4").unlink(missing_ok=True)

        # Measured runs.
        for run_idx in range(measured_runs):
            run_no = run_idx + 1
            out_path = OUTPUT_DIR / f"output_ltx2_3_distilled_i2v_run_{run_no}.mp4"
            print(f"\n[measured {run_no}/{measured_runs}] generating: {out_path}")
            started = time.perf_counter()
            result = generator.generate_video(
                output_path=str(out_path),
                seed=2002 + run_idx,
                **common_kwargs,
            )
            wall = time.perf_counter() - started

            # Prefer the latency reported in the result; fall back to the
            # locally measured wall time when it is absent or falsy.
            reported = result.get("e2e_latency") if isinstance(result, dict) else None
            e2e = reported or wall
            measured_secs.append(e2e)
            print(f"[measured {run_no}/{measured_runs}] e2e={e2e:.2f}s wall={wall:.2f}s")
            if isinstance(result, dict):
                _print_stage_breakdown(result, f"measured {run_no}")
                _collect_stage_times(result, stage_times, stage_order)

        # Summary.
        print("\n=== summary ===")
        print(f"warmup wall-times: {[round(x, 1) for x in warmup_secs]}")
        if measured_secs:
            avg = sum(measured_secs) / len(measured_secs)
            print(
                f"measured e2e (n={len(measured_secs)}): "
                f"{[round(x, 2) for x in measured_secs]} -> avg {avg:.2f}s"
            )
        if stage_times:
            print(f"average stage times over {measured_runs} measured runs:")
            avg_total = 0.0
            for stage_name in stage_order:
                samples = stage_times.get(stage_name) or []
                if samples:
                    stage_avg = sum(samples) / len(samples)
                    avg_total += stage_avg
                    print(f" - {stage_name}: {stage_avg:.3f}s")
            print(f" - stage_sum_avg: {avg_total:.3f}s")
    finally:
        # Always release the generator, even if a run failed.
        generator.shutdown()
if __name__ == "__main__":
main()
basic_ltx2_3_distilled_i2v_typed.py
# SPDX-License-Identifier: Apache-2.0
"""LTX-2.3 distilled image-to-video — typed API (``from_config`` / ``generate``).
Identical generation behavior to ``basic_ltx2_3_distilled_i2v.py``, but
expressed through the newer typed surface (``GeneratorConfig`` /
``GenerationRequest``) instead of the ``from_pretrained(**legacy_kwargs)``
bridge. The typed API is now the preferred entry point — the legacy
example still works but emits a ``DeprecationWarning`` for the LTX-2.3
specific knobs.
Quick start
-----------
export LTX23_I2V_IMAGE=/path/to/your/portrait_or_product.jpg
# optional overrides:
# export LTX23_I2V_PROMPT="a fashion model walks toward camera..."
# export LTX23_OUTPUT_DIR=outputs_video/ltx2_3_distilled_i2v_typed
python examples/inference/basic/basic_ltx2_3_distilled_i2v_typed.py
What the script does
--------------------
1. Loads FastVideo/LTX-2.3-Distilled-Diffusers (8 denoise + 3 refine
steps, CFG=1, no refine LoRA — the distilled production recipe).
2. Compiles the DiT, text encoder, and VAE (fullgraph, Inductor default
mode — autotune adds ~7 min cold-compile here with no measurable
e2e gain).
3. Runs 2 warmup calls (untimed) + 2 measured calls. Two warmups are
kept as a safety net — the first call pays cold compile + first-shape
guard work, and a second warmup ensures any residual recompiles
settle before we measure.
4. Prints a per-stage breakdown and an average over the measured runs.
Hardware notes
--------------
- Single-GPU example; for multi-GPU sequence-parallel see the gradio
demo under ``examples/inference/gradio/local/gradio_local_demo_ltx2_3/``.
- First-time compile takes ~30-40 min on GB200 (~20 min on H100;
cached in ``$TORCHINDUCTOR_CACHE_DIR`` afterwards). Subsequent
invocations only pay the one-time process load + a few seconds of
dynamo trace.
- On GB200 / Blackwell, run with ``env -u LD_LIBRARY_PATH ...`` to
avoid a system-cuBLAS / torch-cuBLAS mismatch that fails every GEMM.
The ``_inductor.shape_padding = False`` line below also avoids a
``pad_mm`` landmine on the same generation of cards.
Typed-API mapping (legacy kwarg ↔ typed field)
----------------------------------------------
- ``num_gpus`` ↔ ``engine.num_gpus``
- ``enable_torch_compile`` ↔ ``engine.compile.enabled``
- ``enable_torch_compile_text_encoder`` ↔ ``engine.compile.text_encoder_enabled``
- ``enable_torch_compile_vae`` ↔ ``engine.compile.vae_enabled``
- ``torch_compile_kwargs`` ↔ ``engine.compile.backend/fullgraph/mode/dynamic``
- ``torch_compile_kwargs_vae`` ↔ empty ``compile.vae_kwargs`` (inherits master)
- ``dit_cpu_offload`` ↔ ``engine.offload.dit``
- ``text_encoder_cpu_offload`` ↔ ``engine.offload.text_encoder``
- ``vae_cpu_offload`` ↔ ``engine.offload.vae``
- ``ltx2_vae_tiling`` ↔ ``pipeline.vae_tiling``
- ``ltx2_refine_enabled`` ↔ ``pipeline.preset_overrides["refine"]["enabled"]``
- ``ltx2_refine_upsampler_path`` ↔ ``pipeline.components.upsampler_weights``
- ``ltx2_refine_lora_path`` ↔ ``pipeline.components.lora_path``
- ``ltx2_refine_num_inference_steps`` ↔ ``pipeline.preset_overrides["refine"]["num_inference_steps"]``
- ``ltx2_refine_guidance_scale`` ↔ ``pipeline.preset_overrides["refine"]["guidance_scale"]``
- ``ltx2_refine_add_noise`` ↔ ``pipeline.preset_overrides["refine"]["add_noise"]``
- ``pipeline_config=PipelineConfig.from_pretrained(model_root)`` ↔ (no-op — ``PipelineConfig.from_kwargs`` already resolves the model-specific class from ``model_path``)
- ``pipeline_config.dit_config.quant_config = None`` ↔ leave ``engine.quantization`` unset
- ``ltx2_images`` / ``ltx2_image_crf`` ↔ ``request.extensions`` (LTX-2 specific, no
first-class typed field yet)
"""
from __future__ import annotations
import os
import time
from collections import OrderedDict
from pathlib import Path
import torch._inductor.config as _inductor
from fastvideo import VideoGenerator
from fastvideo.api import (
CompileConfig,
ComponentConfig,
EngineConfig,
GenerationRequest,
GeneratorConfig,
OffloadConfig,
OutputConfig,
PipelineSelection,
SamplingConfig,
)
from fastvideo.utils import maybe_download_model
# Defaults only: ``setdefault`` leaves any value that is already present in
# the environment untouched.
os.environ.setdefault("FASTVIDEO_ATTENTION_BACKEND", "FLASH_ATTN")
os.environ.setdefault("FASTVIDEO_STAGE_LOGGING", "1")
# Inductor knobs. ``shape_padding=False`` is mandatory on Blackwell to
# avoid a cuBLAS INVALID_VALUE crash inside pad_mm during the refine
# path. The rest are autotune-friendliness flags.
_inductor.shape_padding = False
_inductor.conv_1x1_as_mm = True
_inductor.coordinate_descent_tuning = True
_inductor.coordinate_descent_check_all_directions = True
_inductor.epilogue_fusion = False
# Model repo id or local path. ``LTX23_MODEL_PATH`` overrides the default;
# ``~`` and ``$VAR`` references in the value are expanded.
MODEL_ID = os.path.expandvars(
os.path.expanduser(
os.getenv("LTX23_MODEL_PATH", "FastVideo/LTX-2.3-Distilled-Diffusers")
)
)
# Directory that receives the generated videos (override with
# ``LTX23_OUTPUT_DIR``).
OUTPUT_DIR = Path(
os.getenv(
"LTX23_OUTPUT_DIR", "outputs_video/ltx2_3_distilled_i2v_typed"
)
)
# Path of the conditioning image; an empty string when ``LTX23_I2V_IMAGE``
# is unset, which ``main()`` treats as a usage error.
I2V_IMAGE = os.getenv("LTX23_I2V_IMAGE", "")
# Prompt used when ``LTX23_I2V_PROMPT`` is not set.
DEFAULT_PROMPT = (
"A fashion model takes a slow step forward and shifts her weight, "
"the soft fabric of her clothing swaying and rippling with the "
"motion, her hair shifting gently, soft even studio lighting on a "
"clean light background, elegant slow-motion runway feel."
)
PROMPT = os.getenv("LTX23_I2V_PROMPT", DEFAULT_PROMPT)
def _print_stage_breakdown(result, label: str) -> float | None:
    """Print the per-stage execution times of one run and return their sum.

    Args:
        result: Object returned by the generator; its ``logging_info.stages``
            mapping (stage name -> metrics mapping) is read via ``getattr``.
        label: Tag shown in brackets in front of the header line.

    Returns:
        The sum of every stage's ``execution_time`` (a missing key counts as
        0.0), or ``None`` when ``result`` carries no stage information.
    """
    info = getattr(result, "logging_info", None)
    stage_map = getattr(info, "stages", None) if info else None
    if not stage_map:
        print(f" [{label}] stage breakdown unavailable")
        return None

    print(f" [{label}] stage breakdown:")
    stage_sum = 0.0
    for stage, stats in stage_map.items():
        seconds = float(stats.get("execution_time", 0.0))
        stage_sum += seconds
        print(f" - {stage}: {seconds:.3f}s")
    print(f" - stage_sum: {stage_sum:.3f}s")
    return stage_sum
def _collect_stage_times(
    result,
    stage_times: dict[str, list[float]],
    stage_order: OrderedDict[str, None],
) -> None:
    """Append one run's per-stage execution times to the running totals.

    Args:
        result: Object returned by the generator; ``logging_info.stages`` is
            read via ``getattr`` and ignored when absent or empty.
        stage_times: Mutated in place; maps stage name to the list of
            observed execution times.
        stage_order: Mutated in place; records each stage name the first
            time it is seen so averages can be printed in that order.
    """
    info = getattr(result, "logging_info", None)
    stage_map = getattr(info, "stages", None) if info else None
    if not stage_map:
        return

    for stage, stats in stage_map.items():
        stage_order.setdefault(stage, None)
        bucket = stage_times.setdefault(stage, [])
        bucket.append(float(stats.get("execution_time", 0.0)))
def _resolve_refine_upsampler(model_root: str) -> Path:
    """Locate the refine upsampler directory inside a model snapshot.

    Both directory spellings are probed, ``spatial_upscaler`` first and then
    ``spatial_upsampler``; a candidate matches when it contains a
    ``config.json`` file.

    Args:
        model_root: Path of the downloaded / local model directory.

    Returns:
        The first candidate directory that contains ``config.json``.

    Raises:
        FileNotFoundError: if neither candidate contains ``config.json``. The
            message lists every path that was checked.
    """
    candidates = [
        Path(model_root) / name
        for name in ("spatial_upscaler", "spatial_upsampler")
    ]
    for cand in candidates:
        if (cand / "config.json").is_file():
            return cand

    # Report every probed location, not only the first spelling, so the
    # message matches what was actually searched.
    checked = "\n".join(f"  - {cand / 'config.json'}" for cand in candidates)
    raise FileNotFoundError(
        f"No refine upsampler directory under {model_root}. "
        "Expected one of:\n"
        f"{checked}"
    )
def main() -> None:
    """Run the LTX-2.3 distilled i2v benchmark through the typed API.

    Validates ``LTX23_I2V_IMAGE``, builds a ``GeneratorConfig``, runs the
    warmup calls followed by the measured calls, prints per-run and averaged
    timings, and always shuts the generator down.

    Raises:
        SystemExit: if ``LTX23_I2V_IMAGE`` is unset or is not an existing
            file.
        FileNotFoundError: propagated from ``_resolve_refine_upsampler`` when
            the model directory has no refine upsampler.
    """
    if not I2V_IMAGE:
        raise SystemExit(
            "LTX23_I2V_IMAGE is required for i2v. Example:\n"
            " export LTX23_I2V_IMAGE=/path/to/portrait_or_product.jpg\n"
            " python examples/inference/basic/"
            "basic_ltx2_3_distilled_i2v_typed.py"
        )
    if not Path(I2V_IMAGE).is_file():
        raise SystemExit(f"LTX23_I2V_IMAGE not found: {I2V_IMAGE}")

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    model_root = maybe_download_model(MODEL_ID)
    refine_upsampler_path = _resolve_refine_upsampler(model_root)
    print(f"Model: {model_root}")
    print(f"Refine upsampler: {refine_upsampler_path}")
    print(f"i2v image: {I2V_IMAGE}")
    print(f"Output dir: {OUTPUT_DIR.resolve()}")

    # mode="default" — Inductor's default schedule matches max-autotune on
    # this pipeline (denoise/refine/decode all within ~5 ms, n=2) while
    # saving ~7 min of cold compile on a single GB200.
    generator_config = GeneratorConfig(
        model_path=model_root,
        engine=EngineConfig(
            num_gpus=1,
            # Keep DiT / text encoder / VAE resident on GPU — no CPU offload
            # for serving-style runs. ``image_encoder`` and
            # ``pin_cpu_memory`` are left at their schema defaults
            # (matches the legacy example, which only set these three).
            offload=OffloadConfig(
                dit=False,
                text_encoder=False,
                vae=False,
            ),
            compile=CompileConfig(
                enabled=True,
                text_encoder_enabled=True,
                # ``vae_enabled`` triggers ``_compile_with_conditions`` on
                # ``LTX2CausalVideoAutoencoder``, which compiles just the
                # encoder/decoder submodules and leaves the surrounding
                # tiling control flow eager (required for ``fullgraph``).
                # Empty ``vae_kwargs`` → inherits the master kwargs below.
                vae_enabled=True,
                backend="inductor",
                fullgraph=True,
                mode="default",
                dynamic=False,
            ),
        ),
        pipeline=PipelineSelection(
            # ``PipelineConfig.from_kwargs`` resolves the model-specific
            # pipeline-config class from ``model_path`` automatically, so we
            # don't need to set ``components.pipeline_config_path`` — the
            # model-specific VAE precision / decoder defaults are picked up
            # the same way the legacy example's
            # ``PipelineConfig.from_pretrained(model_root)`` did them.
            components=ComponentConfig(
                upsampler_weights=str(refine_upsampler_path),
                # Distilled has no refine LoRA — omit ``lora_path``.
            ),
            vae_tiling=False,
            preset_overrides={
                "refine": {
                    "enabled": True,
                    "num_inference_steps": 3,
                    "guidance_scale": 1.0,
                    "add_noise": True,
                },
            },
        ),
    )
    generator = VideoGenerator.from_config(generator_config)

    def build_request(out_path: Path, seed: int) -> GenerationRequest:
        """Build the request for one run; only output path and seed vary."""
        return GenerationRequest(
            prompt=PROMPT,
            # distilled is CFG-free; no negative prompt
            negative_prompt="",
            sampling=SamplingConfig(
                num_videos_per_prompt=1,
                seed=seed,
                height=1280,
                width=832,
                num_frames=121,
                fps=24,
                num_inference_steps=8,
                guidance_scale=1.0,
            ),
            output=OutputConfig(
                output_path=str(out_path),
                save_video=True,
                return_frames=False,
            ),
            # LTX-2.3 i2v fields don't have first-class typed slots yet;
            # extensions is the documented bridge. ``ltx2_image_crf=0.0``
            # skips an extra JPEG re-encode of an already JPEG image.
            extensions={
                "ltx2_images": [(I2V_IMAGE, 0, 1.0)],
                "ltx2_image_crf": 0.0,
            },
        )

    warmup_runs = 2
    measured_runs = 2
    warmup_secs: list[float] = []
    measured_secs: list[float] = []
    stage_times: dict[str, list[float]] = {}
    stage_order: OrderedDict[str, None] = OrderedDict()

    try:
        for w in range(warmup_runs):
            print(f"\n[warmup {w + 1}/{warmup_runs}] compiling + generating…")
            t0 = time.perf_counter()
            generator.generate(
                build_request(
                    OUTPUT_DIR / f"_warmup_{w + 1}.mp4", seed=7
                )
            )
            dt = time.perf_counter() - t0
            warmup_secs.append(dt)
            print(f"[warmup {w + 1}/{warmup_runs}] wall={dt:.1f}s")

        # Remove warmup artifacts so only the measured outputs remain.
        for w in range(warmup_runs):
            (OUTPUT_DIR / f"_warmup_{w + 1}.mp4").unlink(missing_ok=True)

        for m in range(measured_runs):
            out_path = (
                OUTPUT_DIR
                / f"output_ltx2_3_distilled_i2v_typed_run_{m + 1}.mp4"
            )
            print(
                f"\n[measured {m + 1}/{measured_runs}] generating: {out_path}"
            )
            t0 = time.perf_counter()
            result = generator.generate(
                build_request(out_path, seed=2002 + m)
            )
            wall = time.perf_counter() - t0
            # ``e2e_latency`` is currently surfaced via ``result.extra``;
            # ``GenerationResult`` exposes ``generation_time`` as a
            # first-class field but the LTX-2 pipeline only fills the
            # legacy ``e2e_latency`` key. Prefer the explicit one, fall
            # back to wall-clock.
            # ``extra`` may be missing or ``None``; only call ``.get`` on
            # something mapping-like so a bare result cannot abort the run.
            extra = getattr(result, "extra", None)
            reported = (
                extra.get("e2e_latency") if hasattr(extra, "get") else None
            )
            e2e = reported or wall
            measured_secs.append(e2e)
            print(
                f"[measured {m + 1}/{measured_runs}] "
                f"e2e={e2e:.2f}s wall={wall:.2f}s"
            )
            _print_stage_breakdown(result, f"measured {m + 1}")
            _collect_stage_times(result, stage_times, stage_order)

        print("\n=== summary ===")
        print(
            f"warmup wall-times: "
            f"{[round(x, 1) for x in warmup_secs]}"
        )
        if measured_secs:
            avg = sum(measured_secs) / len(measured_secs)
            print(
                f"measured e2e (n={len(measured_secs)}): "
                f"{[round(x, 2) for x in measured_secs]} -> avg {avg:.2f}s"
            )
        if stage_times:
            print(f"average stage times over {measured_runs} measured runs:")
            avg_total = 0.0
            for name in stage_order:
                vals = stage_times.get(name) or []
                if not vals:
                    continue
                avg_v = sum(vals) / len(vals)
                avg_total += avg_v
                print(f" - {name}: {avg_v:.3f}s")
            print(f" - stage_sum_avg: {avg_total:.3f}s")
    finally:
        # Release the generator even when a run fails.
        generator.shutdown()


if __name__ == "__main__":
    main()
basic_ltx2_distilled.py
from fastvideo import VideoGenerator
# Text prompt passed to ``generate_video`` in ``main()``.
PROMPT = (
"A warm sunny backyard. The camera starts in a tight cinematic close-up "
"of a woman and a man in their 30s, facing each other with serious "
"expressions. The woman, emotional and dramatic, says softly, \"That's "
"it... Dad's lost it. And we've lost Dad.\" The man exhales, slightly "
"annoyed: \"Stop being so dramatic, Jess.\" A beat. He glances aside, "
"then mutters defensively, \"He's just having fun.\" The camera slowly "
"pans right, revealing the grandfather in the garden wearing enormous "
"butterfly wings, waving his arms in the air like he's trying to take "
"off. He shouts, \"Wheeeew!\" as he flaps his wings with full commitment. "
"The woman covers her face, on the verge of tears. The tone is deadpan, "
"absurd, and quietly tragic."
)
import os
# Unconditional assignment: replaces any FASTVIDEO_ATTENTION_BACKEND value
# already present in the environment.
os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "FLASH_ATTN"
def main() -> None:
    """Generate one LTX2 distilled text-to-video sample on four GPUs.

    The generator is shut down in a ``finally`` block so it is released even
    when generation raises.
    """
    generator = VideoGenerator.from_pretrained(
        "FastVideo/LTX2-Distilled-Diffusers",
        num_gpus=4,
    )
    output_path = "outputs_video/ltx2_basic/output_ltx2_distilled_t2v.mp4"
    try:
        generator.generate_video(
            prompt=PROMPT,
            output_path=output_path,
            save_video=True,
        )
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()
basic_ltx2_distilled_fast_profile.py
# SPDX-License-Identifier: Apache-2.0
import json
import os
import time
from collections import OrderedDict
from pathlib import Path
import torch
import torch._inductor.config
from fastvideo import VideoGenerator
from fastvideo.configs.pipelines.base import PipelineConfig
from fastvideo.layers.quantization.nvfp4_config import NVFP4Config
from fastvideo.utils import maybe_download_model
# Validation file resolved relative to this script: ``parents[2]`` is the
# directory two levels above the script's own folder.
VALIDATION_JSON = (
Path(__file__).resolve().parents[2] / "training" / "finetune" / "ltx2" / "validation.json"
)
# Override with a local snapshot or converted directory when needed, e.g.
# export LTX2_MODEL_PATH=/raid/$USER/hf/FastVideo/LTX2-Distilled-Diffusers
MODEL_ID = os.path.expandvars(
os.path.expanduser(os.getenv("LTX2_MODEL_PATH", "FastVideo/LTX2-Distilled-Diffusers"))
)
# Directory that receives the generated videos.
OUTPUT_DIR = Path("outputs_video/ltx2_distilled_fast_profile")
# Unconditional assignments: these replace any values already present in
# the environment.
os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "FLASH_ATTN"
os.environ["FASTVIDEO_STAGE_LOGGING"] = "1"
# Tune Inductor flags
config = torch._inductor.config
config.conv_1x1_as_mm = True # treat 1x1 convolutions as matrix muls
config.coordinate_descent_tuning = True
config.coordinate_descent_check_all_directions = True
config.epilogue_fusion = False # do not fuse pointwise ops into matmuls
def load_validation_entries(path: Path) -> list[dict]:
    """Read the validation entries stored in a JSON file.

    Args:
        path: UTF-8 JSON file whose top level is an object with a ``data``
            list.

    Returns:
        The elements of ``data`` that are dicts; other elements are dropped.

    Raises:
        ValueError: if the top level is not an object holding a ``data``
            list.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    entries = payload.get("data") if isinstance(payload, dict) else None
    if not isinstance(entries, list):
        raise ValueError(f"Unsupported validation format in {path}. Expected {{'data': [...]}}.")
    return [item for item in entries if isinstance(item, dict)]
def print_stage_breakdown(
    result: dict,
    run_idx: int,
    num_runs: int,
) -> float | None:
    """Print the per-stage execution times of one run and return their sum.

    Args:
        result: Dict returned by ``generate_video``; ``result["logging_info"]``
            is expected to expose a ``stages`` mapping of stage name to
            metrics mapping.
        run_idx: 1-based run number shown in the ``[run/total]`` prefix.
        num_runs: Total number of runs shown in the prefix.

    Returns:
        The sum of the stages' ``execution_time`` values (missing keys count
        as 0.0), or ``None`` when no stage information is available.
    """
    prefix = f"[{run_idx}/{num_runs}]"
    info = result.get("logging_info")
    if info is None:
        print(f"{prefix} Stage breakdown unavailable: no logging_info")
        return None

    stage_map = getattr(info, "stages", None)
    if not stage_map:
        print(f"{prefix} Stage breakdown unavailable: no stage timings")
        return None

    print(f"{prefix} Stage breakdown:")
    running = 0.0
    for label, stats in stage_map.items():
        seconds = float(stats.get("execution_time", 0.0))
        running += seconds
        print(f" - {label}: {seconds:.3f}s")
    print(f" - total(stage sum): {running:.3f}s")
    return running
def extract_sr_forward_latency(
    result: dict,
) -> tuple[float | None, list[tuple[str, float]], list[str]]:
    """Sum the execution time of the stages that look like SR / refine stages.

    A stage matches when its lower-cased name contains the substring given in
    ``FASTVIDEO_SR_LATENCY_STAGE_SUBSTR``. When that variable is unset or
    blank, a stage matches if its name contains ``srdenoisingstage``,
    ``sr_denoising`` or ``upsample``, or both ``refine`` and ``denois``.

    Args:
        result: Dict returned by ``generate_video``; ``result["logging_info"]``
            is expected to expose a ``stages`` mapping.

    Returns:
        ``(total, matches, stage_names)`` where ``total`` is the summed time
        of the matching stages (``None`` when nothing matched), ``matches``
        lists ``(stage_name, seconds)`` pairs, and ``stage_names`` lists every
        stage key seen. All three are empty / ``None`` when no stage
        information is available.
    """
    info = result.get("logging_info")
    stage_map = getattr(info, "stages", None) if info is not None else None
    if not stage_map:
        return None, [], []

    all_names = list(stage_map.keys())
    needle = os.getenv("FASTVIDEO_SR_LATENCY_STAGE_SUBSTR", "").strip().lower()
    default_markers = ("srdenoisingstage", "sr_denoising", "upsample")

    def _is_sr_stage(stage_name: str) -> bool:
        lowered = stage_name.lower()
        if needle:
            return needle in lowered
        if any(marker in lowered for marker in default_markers):
            return True
        return "refine" in lowered and "denois" in lowered

    matches: list[tuple[str, float]] = [
        (stage_name, float(stats.get("execution_time", 0.0)))
        for stage_name, stats in stage_map.items()
        if _is_sr_stage(stage_name)
    ]
    if not matches:
        return None, [], all_names
    return sum(seconds for _, seconds in matches), matches, all_names
def collect_stage_times(
    result: dict,
    stage_times: dict[str, list[float]],
    stage_order: OrderedDict[str, None],
) -> None:
    """Append one run's per-stage execution times to the running totals.

    Args:
        result: Dict returned by ``generate_video``; nothing is recorded when
            it has no ``logging_info`` or no stage timings.
        stage_times: Mutated in place; maps stage name to observed times.
        stage_order: Mutated in place; remembers first-seen stage order.
    """
    info = result.get("logging_info")
    stage_map = getattr(info, "stages", None) if info is not None else None
    if not stage_map:
        return

    for label, stats in stage_map.items():
        stage_order.setdefault(label, None)
        seconds = float(stats.get("execution_time", 0.0))
        stage_times.setdefault(label, []).append(seconds)
def print_stage_averages(
    stage_times: dict[str, list[float]],
    stage_order: OrderedDict[str, None],
    measured_runs: int,
) -> None:
    """Print the mean execution time of each stage over the measured runs.

    Args:
        stage_times: Maps stage name to the execution times collected for it.
        stage_order: Stage names in the order they should be printed.
        measured_runs: Number of measured runs, used in the header line;
            nothing is printed when it is zero or negative.
    """
    if measured_runs <= 0:
        return
    if not stage_times:
        print("No stage timings collected for measured runs.")
        return

    print(f"Average stage times over {measured_runs} measured runs:")
    sum_of_means = 0.0
    for label in stage_order:
        samples = stage_times.get(label, [])
        if samples:
            mean = sum(samples) / len(samples)
            sum_of_means += mean
            print(f" - {label}: {mean:.3f}s")
    print(f" - total(stage sum avg): {sum_of_means:.3f}s")
def resolve_refine_upsampler_path(model_root: str) -> Path:
    """Locate the LTX2 refine upsampler directory.

    Search order: the directory named by ``LTX2_REFINE_UPSAMPLER_PATH`` (when
    set, with ``~`` and ``$VAR`` expanded), then ``spatial_upscaler`` and
    ``spatial_upsampler`` under ``model_root``. A directory matches when it
    contains a ``config.json`` file.

    Args:
        model_root: Path of the downloaded / local model directory.

    Returns:
        The first matching directory.

    Raises:
        FileNotFoundError: if no candidate matches; the message lists every
            directory that was checked.
    """
    base = Path(model_root)
    search_dirs = [base / "spatial_upscaler", base / "spatial_upsampler"]

    override = os.getenv("LTX2_REFINE_UPSAMPLER_PATH")
    if override:
        expanded = os.path.expandvars(os.path.expanduser(override))
        search_dirs = [Path(expanded), *search_dirs]

    found = next(
        (folder for folder in search_dirs if (folder / "config.json").is_file()),
        None,
    )
    if found is not None:
        return found

    listing = "\n".join(f" - {folder}" for folder in search_dirs)
    raise FileNotFoundError(
        "Could not find an LTX2 refine upsampler directory.\n"
        "Checked:\n"
        f"{listing}"
    )
def main() -> None:
    """Benchmark LTX2 distilled text-to-video generation with stage timings.

    Uses the caption of the first validation entry as the prompt, runs
    ``num_runs`` generations (the first ``warmup_runs`` are excluded from the
    averages), and prints per-run and averaged generation time, end-to-end
    latency, SR forward latency, per-stage times and non-stage overhead. The
    generator is always shut down.

    Raises:
        FileNotFoundError: if the validation file or the refine upsampler
            directory cannot be found.
        ValueError: if the validation file has no entries or the first entry
            lacks a non-blank string caption.
    """
    if not VALIDATION_JSON.exists():
        raise FileNotFoundError(f"Validation file not found: {VALIDATION_JSON}")

    validation_entries = load_validation_entries(VALIDATION_JSON)
    if not validation_entries:
        raise ValueError(f"No validation entries found in {VALIDATION_JSON}")

    benchmark_entry = validation_entries[0]
    prompt = benchmark_entry.get("caption")
    if not isinstance(prompt, str) or not prompt.strip():
        raise ValueError("First validation entry is missing a usable caption")

    num_runs = 12
    warmup_runs = 2
    avg_window = num_runs - warmup_runs
    # Index of the first run that counts towards the averages.
    measured_start_idx = max(warmup_runs, num_runs - avg_window)

    model_root = maybe_download_model(MODEL_ID)
    refine_upsampler_path = resolve_refine_upsampler_path(model_root)
    print(f"Using refine upsampler: {refine_upsampler_path}")

    pipeline_config = PipelineConfig.from_pretrained(model_root)
    pipeline_config.dit_config.quant_config = NVFP4Config()

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    torch_compile_kwargs = {
        "backend": "inductor",
        "fullgraph": True,
        # Uncomment for final best-performance profiling. It is disabled
        # for faster development iteration because autotuning is slow.
        # "mode": "max-autotune-no-cudagraphs",
        "dynamic": False,
    }

    generator = VideoGenerator.from_pretrained(
        model_root,
        num_gpus=1,
        ltx2_refine_enabled=True,
        ltx2_refine_upsampler_path=str(refine_upsampler_path),
        refine_lora_path="",  # keep refine LoRA disabled in this repo's typed adapter
        ltx2_refine_lora_path="",  # keep refine LoRA disabled for distilled model
        ltx2_refine_num_inference_steps=2,
        ltx2_refine_guidance_scale=1.0,
        ltx2_refine_add_noise=True,
        pipeline_config=pipeline_config,
        enable_torch_compile=True,
        enable_torch_compile_text_encoder=True,
        enable_torch_compile_vae=True,
        torch_compile_kwargs=torch_compile_kwargs,
        torch_compile_kwargs_vae=torch_compile_kwargs,
        dit_cpu_offload=False,
        text_encoder_cpu_offload=False,
        vae_cpu_offload=False,
        ltx2_vae_tiling=False,
    )

    run_times: list[float] = []
    e2e_times: list[float] = []
    sr_forward_times: list[float] = []
    non_stage_overhead_times: list[float] = []
    stage_times: dict[str, list[float]] = {}
    stage_order: OrderedDict[str, None] = OrderedDict()

    try:
        for i in range(num_runs):
            output_path = OUTPUT_DIR / f"output_ltx2_basic_t2v_run_{i + 1}.mp4"
            if output_path.exists():
                output_path.unlink()
                print(f"[{i + 1}/{num_runs}] Removed existing file: {output_path}")

            print(f"[{i + 1}/{num_runs}] Generating: {output_path}")
            # With stage logging disabled, synchronize around the call so the
            # wall-clock fallback covers the queued GPU work. The same guard
            # is used before and after generation so a machine without CUDA
            # never reaches torch.cuda.synchronize().
            sync_for_timing = (
                os.environ.get("FASTVIDEO_STAGE_LOGGING") == "0"
                and torch.cuda.is_available()
            )
            if sync_for_timing:
                torch.cuda.synchronize()
            start = time.perf_counter()
            result = generator.generate_video(
                prompt=prompt,
                output_path=str(output_path),
                fps=24,
                seed=10,
                save_video=True,
                guidance_scale=1.0,
                height=benchmark_entry.get("height", 1088),
                width=benchmark_entry.get("width", 1920),
                num_frames=121,
                num_inference_steps=5,
                # image_path="examples/inference/basic/prompt1.png",
                # ltx2_image_crf=0.0
            )
            if sync_for_timing:
                torch.cuda.synchronize()
            # Read the clock once so both fallbacks report the same value.
            wall = time.perf_counter() - start

            elapsed = result.get("generation_time") if isinstance(result, dict) else None
            e2e_elapsed = result.get("e2e_latency") if isinstance(result, dict) else None
            if elapsed is None:
                elapsed = wall
            if e2e_elapsed is None:
                e2e_elapsed = wall
            run_times.append(elapsed)
            e2e_times.append(e2e_elapsed)
            print(f"[{i + 1}/{num_runs}] Generation time: {elapsed:.2f}s")
            print(f"[{i + 1}/{num_runs}] End-to-end latency: {e2e_elapsed:.2f}s")

            if isinstance(result, dict):
                stage_sum = print_stage_breakdown(result, i + 1, num_runs)
                if stage_sum is not None:
                    non_stage_overhead = e2e_elapsed - stage_sum
                    print(f"[{i + 1}/{num_runs}] Non-stage overhead (e2e - stage sum): {non_stage_overhead:.3f}s")
                    if i >= measured_start_idx:
                        non_stage_overhead_times.append(non_stage_overhead)

                sr_forward_total, sr_stage_entries, stage_names = extract_sr_forward_latency(result)
                if sr_forward_total is None:
                    print(f"[{i + 1}/{num_runs}] SR forward latency unavailable")
                    if stage_names:
                        print(f" Available stage keys: {', '.join(stage_names)}")
                        print(" Tip: set FASTVIDEO_SR_LATENCY_STAGE_SUBSTR=<substring> to match your SR stage key.")
                else:
                    print(f"[{i + 1}/{num_runs}] SR forward latency: {sr_forward_total:.3f}s")
                    for sr_stage_name, sr_exec_time in sr_stage_entries:
                        print(f" - {sr_stage_name}: {sr_exec_time:.3f}s")
                    if i >= measured_start_idx:
                        sr_forward_times.append(sr_forward_total)

                if i >= measured_start_idx:
                    collect_stage_times(result, stage_times, stage_order)

        measured_times = run_times[measured_start_idx:]
        avg_time = sum(measured_times) / len(measured_times)
        print(
            f"Average video generation time over {len(measured_times)} runs "
            f"(runs {measured_start_idx + 1}-{len(run_times)}, skipping first {warmup_runs} warmup runs): "
            f"{avg_time:.2f}s"
        )

        measured_e2e_times = e2e_times[measured_start_idx:]
        avg_e2e_time = sum(measured_e2e_times) / len(measured_e2e_times)
        print(
            f"Average end-to-end latency over {len(measured_e2e_times)} runs "
            f"(runs {measured_start_idx + 1}-{len(e2e_times)}, skipping first {warmup_runs} warmup runs): "
            f"{avg_e2e_time:.2f}s"
        )

        if sr_forward_times:
            avg_sr_forward = sum(sr_forward_times) / len(sr_forward_times)
            print(f"Average SR forward latency over {len(sr_forward_times)} runs: {avg_sr_forward:.3f}s")
        else:
            print("Average SR forward latency unavailable (no SR stages matched).")

        print_stage_averages(stage_times, stage_order, len(measured_times))

        if non_stage_overhead_times:
            avg_non_stage_overhead = sum(non_stage_overhead_times) / len(non_stage_overhead_times)
            print(
                "Average non-stage overhead over "
                f"{len(non_stage_overhead_times)} measured runs: {avg_non_stage_overhead:.3f}s"
            )
        else:
            print("Average non-stage overhead unavailable (no stage timings).")
    finally:
        # Release the generator even when a run fails.
        generator.shutdown()


if __name__ == "__main__":
    main()
basic_lucy_edit.py
from fastvideo import VideoGenerator
OUTPUT_PATH = "video_samples_lucy_edit"


def main():
    """Edit a reference video with Lucy-Edit-Dev following a text instruction."""
    # Engine options: single GPU, DiT and text encoder offloaded to CPU.
    engine_options = {
        "num_gpus": 1,
        "use_fsdp_inference": False,
        "dit_cpu_offload": True,
        "vae_cpu_offload": False,
        "text_encoder_cpu_offload": True,
        "pin_cpu_memory": True,
    }
    generator = VideoGenerator.from_pretrained(
        "decart-ai/Lucy-Edit-Dev", **engine_options
    )

    edit_instruction = ("Change the apron and blouse to a classic clown costume: satin "
                        "polka-dot jumpsuit in bright primary colors, ruffled white collar, "
                        "oversized pom-pom buttons, white gloves, oversized red shoes, red "
                        "foam nose; soft window light from left, eye-level medium shot.")
    source_video = "https://d2drjpuinn46lb.cloudfront.net/painter_original_edit.mp4"

    # Per-request settings for the edit.
    request_options = {
        "negative_prompt": "",
        "video_path": source_video,
        "output_path": OUTPUT_PATH,
        "save_video": True,
        "height": 480,
        "width": 832,
        "num_frames": 81,
        "fps": 24,
        "guidance_scale": 5.0,
    }
    generator.generate_video(edit_instruction, **request_options)


if __name__ == "__main__":
    main()
basic_matrixgame2.py
from fastvideo import VideoGenerator
from fastvideo.models.dits.matrixgame2.utils import create_action_presets
import torch
# Available variants: "base_distilled_model", "gta_distilled_model", "templerun_distilled_model"
# Each variant has different keyboard_dim:
# - base_distilled_model: keyboard_dim=4
# - gta_distilled_model: keyboard_dim=2
# - templerun_distilled_model: keyboard_dim=7 (keyboard only, no mouse)
# Change this value to select which entry of VARIANT_CONFIG ``main()`` uses.
MODEL_VARIANT = "base_distilled_model"
# Variant-specific settings
# Each entry provides the model repo id, the keyboard action dimension, and
# the URL of the conditioning image passed as ``image_path``.
VARIANT_CONFIG = {
"base_distilled_model": {
"model_path": "FastVideo/Matrix-Game-2.0-Base-Distilled-Diffusers",
"keyboard_dim": 4,
"image_url": "https://raw.githubusercontent.com/SkyworkAI/Matrix-Game/main/Matrix-Game-2/demo_images/universal/0000.png",
},
"gta_distilled_model": {
"model_path": "FastVideo/Matrix-Game-2.0-GTA-Distilled-Diffusers",
"keyboard_dim": 2,
"image_url": "https://raw.githubusercontent.com/SkyworkAI/Matrix-Game/main/Matrix-Game-2/demo_images/gta_drive/0000.png",
},
"templerun_distilled_model": {
"model_path": "FastVideo/Matrix-Game-2.0-TempleRun-Distilled-Diffusers",
"keyboard_dim": 7,
"image_url": "https://raw.githubusercontent.com/SkyworkAI/Matrix-Game/main/Matrix-Game-2/demo_images/temple_run/0000.png",
},
}
# Passed to ``generate_video`` as ``output_path``.
OUTPUT_PATH = "video_samples_matrixgame2"
def main():
    """Generate one Matrix-Game 2.0 video for the variant in ``MODEL_VARIANT``."""
    variant = VARIANT_CONFIG[MODEL_VARIANT]

    # FastVideo picks the optimal default arguments for the model on its
    # own; for a local path it makes a best-effort attempt to identify them.
    # Distributed setup is handled automatically as well.
    generator = VideoGenerator.from_pretrained(
        variant["model_path"],
        num_gpus=1,
        # set to True if GPU is out of memory
        use_fsdp_inference=False,
        # DiT needs to be offloaded for MoE
        dit_cpu_offload=True,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        # Set pin_cpu_memory to false if CPU RAM is limited and there are no
        # frequent CPU-GPU transfers
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )

    frame_count = 597
    presets = create_action_presets(
        frame_count, keyboard_dim=variant["keyboard_dim"]
    )
    grid = torch.tensor([150, 44, 80])

    request_options = {
        "prompt": "",
        "image_path": variant["image_url"],
        # unsqueeze(0) adds a leading dimension of size 1 to each condition.
        "mouse_cond": presets["mouse"].unsqueeze(0),
        "keyboard_cond": presets["keyboard"].unsqueeze(0),
        "grid_sizes": grid,
        "num_frames": frame_count,
        "height": 352,
        "width": 640,
        "num_inference_steps": 50,
        "output_path": OUTPUT_PATH,
        "save_video": True,
    }
    generator.generate_video(**request_options)


if __name__ == "__main__":
    main()
basic_matrixgame2_streaming.py
from fastvideo.entrypoints.streaming_generator import StreamingVideoGenerator
from fastvideo.models.dits.matrixgame2.utils import get_current_action_async, expand_action_to_frames
import torch
import asyncio
# Available variants: "base_distilled_model", "gta_distilled_model", "templerun_distilled_model"
# Each variant has different keyboard_dim:
# - base_distilled_model: keyboard_dim=4
# - gta_distilled_model: keyboard_dim=2
# - templerun_distilled_model: keyboard_dim=7 (keyboard only, no mouse)
# Change this value to select which entry of VARIANT_CONFIG ``main()`` uses.
MODEL_VARIANT = "base_distilled_model"
# Variant-specific settings
# Each entry provides the model repo id, the keyboard action dimension, the
# ``mode`` string handed to ``get_current_action_async``, and the URL of the
# conditioning image passed as ``image_path``.
VARIANT_CONFIG = {
"base_distilled_model": {
"model_path": "FastVideo/Matrix-Game-2.0-Base-Distilled-Diffusers",
"keyboard_dim": 4,
"mode": "universal",
"image_url": "https://raw.githubusercontent.com/SkyworkAI/Matrix-Game/main/Matrix-Game-2/demo_images/universal/0000.png",
},
"gta_distilled_model": {
"model_path": "FastVideo/Matrix-Game-2.0-GTA-Distilled-Diffusers",
"keyboard_dim": 2,
"mode": "gta_drive",
"image_url": "https://raw.githubusercontent.com/SkyworkAI/Matrix-Game/main/Matrix-Game-2/demo_images/gta_drive/0000.png",
},
"templerun_distilled_model": {
"model_path": "FastVideo/Matrix-Game-2.0-TempleRun-Distilled-Diffusers",
"keyboard_dim": 7,
"mode": "templerun",
"image_url": "https://raw.githubusercontent.com/SkyworkAI/Matrix-Game/main/Matrix-Game-2/demo_images/temple_run/0000.png",
},
}
# Passed to ``generator.reset`` as ``output_path``.
OUTPUT_PATH = "video_samples_matrixgame2"
async def main():
    """Interactively stream a Matrix-Game 2.0 video block by block.

    After ``reset`` the loop asks for an action, generates the next block,
    and prompts whether to continue, for at most ``max_blocks`` blocks. The
    video is finalized once the loop ends normally, and the generator is shut
    down in a ``finally`` block so an exception or Ctrl-C during the
    interactive loop still releases it.
    """
    # FastVideo will automatically use the optimal default arguments for the
    # model.
    # If a local path is provided, FastVideo will make a best effort
    # attempt to identify the optimal arguments.
    config = VARIANT_CONFIG[MODEL_VARIANT]
    generator = StreamingVideoGenerator.from_pretrained(
        config["model_path"],
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,  # DiT needs to be offloaded for MoE
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        # Set pin_cpu_memory to false if CPU RAM is limited and there are no
        # frequent CPU-GPU transfers
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )

    try:
        max_blocks = 50
        num_frames = 597
        # All-zero keyboard and mouse conditions for the initial reset.
        actions = {
            "keyboard": torch.zeros((num_frames, config["keyboard_dim"])),
            "mouse": torch.zeros((num_frames, 2))
        }
        grid_sizes = torch.tensor([150, 44, 80])
        mode = config["mode"]

        generator.reset(
            prompt="",
            image_path=config["image_url"],
            mouse_cond=actions["mouse"].unsqueeze(0),
            keyboard_cond=actions["keyboard"].unsqueeze(0),
            grid_sizes=grid_sizes,
            num_frames=num_frames,
            height=352,
            width=640,
            num_inference_steps=50,
            output_path=OUTPUT_PATH,
            save_video=True,
        )
        print("Initialization complete.")

        for block_id in range(max_blocks):
            print(f"\n=== Block {block_id + 1}/{max_blocks} ===")
            action = await get_current_action_async(mode)
            keyboard_cond, mouse_cond = expand_action_to_frames(action, 12)
            await generator.step_async(keyboard_cond, mouse_cond)
            # Read the answer in a worker thread so the event loop is not
            # blocked while waiting for input.
            if (await asyncio.to_thread(input, "\nContinue? (y/n): ")).lower() == 'n':
                break

        # Save final video
        generator.finalize()
    finally:
        generator.shutdown()


if __name__ == "__main__":
    asyncio.run(main())
basic_matrixgame3.py
from fastvideo import VideoGenerator
MODEL_PATH = "FastVideo/Matrix-Game-3.0-Base-Distilled-Diffusers"
IMAGE_URL = "https://raw.githubusercontent.com/SkyworkAI/Matrix-Game/main/Matrix-Game-3/demo_images/001/image.png"
PROMPT = "A colorful, animated cityscape with a gas station and various buildings."
OUTPUT_PATH = "video_samples_matrixgame3"


def main():
    """Generate one Matrix-Game 3.0 video from the demo image and prompt."""
    # Engine options: single GPU, only the text encoder offloaded to CPU.
    engine_options = {
        "num_gpus": 1,
        "use_fsdp_inference": False,
        "dit_cpu_offload": False,
        "vae_cpu_offload": False,
        "text_encoder_cpu_offload": True,
        "pin_cpu_memory": True,
    }
    generator = VideoGenerator.from_pretrained(MODEL_PATH, **engine_options)

    # Per-request settings.
    request_options = {
        "prompt": PROMPT,
        "image_path": IMAGE_URL,
        "height": 720,
        "width": 1280,
        "num_frames": 57,
        "num_inference_steps": 3,
        "guidance_scale": 1.0,
        "seed": 42,
        "output_path": OUTPUT_PATH,
        "save_video": True,
    }
    generator.generate_video(**request_options)


if __name__ == "__main__":
    main()
basic_mps.py
from fastvideo import VideoGenerator, PipelineConfig
from fastvideo.api.sampling_param import SamplingParam
def main():
    """Generate two short Wan2.1 videos with settings chosen for Apple MPS."""
    model_id = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"

    pipeline_cfg = PipelineConfig.from_pretrained(model_id)
    pipeline_cfg.text_encoder_precisions = ["fp16"]

    generator = VideoGenerator.from_pretrained(
        model_id,
        pipeline_config=pipeline_cfg,
        # Disable FSDP for MPS
        use_fsdp_inference=False,
        dit_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,
        disable_autocast=False,
        num_gpus=1,
    )

    # Sampling parameters with a reduced workload: 25 frames instead of the
    # default 81, because the SDPA attention backend has to be used on MPS.
    sampling = SamplingParam.from_pretrained(model_id)
    sampling.num_frames = 25
    sampling.height = 256
    sampling.width = 256

    raccoon_prompt = ("A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
                      "wide with interest. The playful yet serene atmosphere is complemented by soft "
                      "natural light filtering through the petals. Mid-shot, warm and cheerful tones.")
    lion_prompt = ("A majestic lion strides across the golden savanna, its powerful frame "
                   "glistening under the warm afternoon sun. The tall grass ripples gently in "
                   "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
                   "embodying the raw energy of the wild. Low angle, steady tracking shot, "
                   "cinematic.")

    # The same generator serves both prompts, one after the other.
    videos = [
        generator.generate_video(text, sampling_param=sampling)
        for text in (raccoon_prompt, lion_prompt)
    ]


if __name__ == "__main__":
    main()
basic_ray.py
from fastvideo import VideoGenerator
# from fastvideo.api.sampling_param import SamplingParam
OUTPUT_PATH = "video_samples"
def main():
    """Generate two videos on 2 GPUs using the Ray distributed executor backend.

    Both videos are saved under OUTPUT_PATH; the model is loaded only once.
    """
    # FastVideo picks default arguments for the model; for a local path it
    # makes a best-effort attempt to identify them.
    generator = VideoGenerator.from_pretrained(
        "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
        # FastVideo will automatically handle distributed setup
        num_gpus=2,
        use_fsdp_inference=True,
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        # set to false if low CPU RAM or hit obscure "CUDA error: Invalid argument"
        pin_cpu_memory=True,
        distributed_executor_backend="ray",
        # image_encoder_cpu_offload=False,
    )

    # The generation API is the same regardless of GPU count, and further
    # prompts reuse the already-loaded model.
    prompts = (
        ("A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
         "wide with interest. The playful yet serene atmosphere is complemented by soft "
         "natural light filtering through the petals. Mid-shot, warm and cheerful tones."),
        ("A majestic lion strides across the golden savanna, its powerful frame "
         "glistening under the warm afternoon sun. The tall grass ripples gently in "
         "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
         "embodying the raw energy of the wild. Low angle, steady tracking shot, "
         "cinematic."),
    )
    for text in prompts:
        generator.generate_video(text, output_path=OUTPUT_PATH, save_video=True)
if __name__ == "__main__":
main()
basic_sd35_t2i.py
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import argparse
import os
import re
from typing import List
DEFAULT_PROMPTS = [
"a photo of a cat",
"a cinematic photo of a red panda wearing a tiny backpack, standing on a rainy neon-lit street at night, shallow depth of field, sharp focus, 35mm, bokeh",
]
def _safe_filename(text: str, max_len: int = 100) -> str:
    """Return a stable, filesystem-friendly filename stem derived from *text*.

    The text is truncated to ``max_len`` characters and stripped (per the
    original author, this mirrors the ``prompt[:100].strip()`` that
    VideoGenerator applies internally). Path separators become ``_``,
    whitespace runs collapse to one space, and any character outside
    ``A-Za-z0-9 .,_-`` becomes ``_``. Leading/trailing spaces and dots are
    removed; if nothing is left, ``"prompt"`` is returned.
    """
    stem = text[:max_len].strip()
    # Neutralise both the primary and (if the platform has one) alternate
    # path separator.
    for separator in (os.sep, os.altsep):
        if separator:
            stem = stem.replace(separator, "_")
    collapsed = re.sub(r"\s+", " ", stem)
    sanitized = re.sub(r"[^A-Za-z0-9 .,_-]", "_", collapsed).strip(" .")
    if not sanitized:
        return "prompt"
    return sanitized
def _remove_existing_outputs(out_dir: str, filename_base: str) -> None:
    """Delete earlier outputs named after *filename_base* inside *out_dir*.

    Removes ``<filename_base>.mp4``/``.png`` and numbered variants such as
    ``<filename_base>_1.png`` so that a new run keeps a deterministic name
    (the original author notes VideoGenerator otherwise appends ``_1``,
    ``_2``, ... suffixes). Does nothing when *out_dir* is not a directory.
    """
    if os.path.isdir(out_dir):
        stale = re.compile(rf"^{re.escape(filename_base)}(_\d+)?\.(mp4|png)$")
        for entry in filter(stale.match, os.listdir(out_dir)):
            try:
                os.remove(os.path.join(out_dir, entry))
            except FileNotFoundError:
                # Already gone (e.g. removed concurrently); nothing to do.
                pass
def parse_args() -> argparse.Namespace:
p = argparse.ArgumentParser(description="Run SD3.5 Medium text-to-image with FastVideo VideoGenerator.")
p.add_argument("--model-path", default="stabilityai/stable-diffusion-3.5-medium", help="Path to local diffusers-format SD3.5 weights directory.")
p.add_argument(
"--out-dir",
"--outdir",
default="outputs/sd35/samples",
help="Output directory for generated mp4 files.",
)
p.add_argument(
"--prompt",
action="append",
default=None,
help="Prompt text. Repeat --prompt multiple times to generate multiple samples.",
)
p.add_argument("--negative", default="lowres, blurry, jpeg artifacts, watermark, text", help="Negative prompt.")
p.add_argument(
"--backend",
default=None,
help="Set FASTVIDEO_ATTENTION_BACKEND (e.g. TORCH_SDPA). If omitted, respects the existing env var.",
)
p.add_argument("--seed", type=int, default=42, help="Base seed. Each prompt uses seed + prompt_idx.")
p.add_argument("--height", type=int, default=768, help="Output height.")
p.add_argument("--width", type=int, default=768, help="Output width.")
p.add_argument("--steps", type=int, default=28, help="Number of inference steps.")
p.add_argument("--guidance", type=float, default=6.0, help="Guidance scale.")
p.add_argument("--num-gpus", type=int, default=1, help="Number of GPUs to use.")
return p.parse_args()
def main() -> None:
    """Generate one PNG per prompt with SD3.5 through VideoGenerator.

    Prompts come from repeated ``--prompt`` flags, falling back to
    DEFAULT_PROMPTS. Prompt ``i`` uses seed ``--seed + i`` and is written to
    ``<out_dir>/sd35_<i>_seed<seed>_<sanitized prompt>.png`` after any
    earlier outputs with that name are removed. The generator is always shut
    down, even when generation fails.
    """
    args = parse_args()
    prompts: List[str] = args.prompt or DEFAULT_PROMPTS

    # Must be set before fastvideo is imported, hence the deferred import.
    if args.backend:
        os.environ["FASTVIDEO_ATTENTION_BACKEND"] = args.backend
    from fastvideo import VideoGenerator

    os.makedirs(args.out_dir, exist_ok=True)

    # Single-image workload: no sequence/tensor parallelism and every offload
    # option disabled.
    generator = VideoGenerator.from_pretrained(
        model_path=args.model_path,
        num_gpus=args.num_gpus,
        workload_type="t2i",
        sp_size=1,
        tp_size=1,
        dit_cpu_offload=False,
        dit_layerwise_offload=False,
        text_encoder_cpu_offload=False,
        vae_cpu_offload=False,
        image_encoder_cpu_offload=False,
        pin_cpu_memory=False,
        use_fsdp_inference=False,
    )
    try:
        for idx, text in enumerate(prompts):
            seed = args.seed + idx
            stem = f"sd35_{idx:02d}_seed{seed}_{_safe_filename(text, max_len=80)}"
            _remove_existing_outputs(args.out_dir, stem)
            target = os.path.join(args.out_dir, f"{stem}.png")
            print(f"[sd35] prompt_idx={idx} seed={seed} output_path={target}")
            # A single frame at 1 fps is how an image is requested here.
            generator.generate_video(
                text,
                output_path=target,
                height=args.height,
                width=args.width,
                num_frames=1,
                fps=1,
                num_inference_steps=args.steps,
                guidance_scale=args.guidance,
                seed=seed,
                negative_prompt=args.negative,
                save_video=True,
            )
        print(f"[sd35] done. outputs written to: {args.out_dir}")
    finally:
        generator.shutdown()
if __name__ == "__main__":
main()
basic_self_forcing_causal.py
import os
import time
from fastvideo import VideoGenerator, SamplingParam
OUTPUT_PATH = "video_samples_causal"
def main():
    """Generate one video with the self-forcing causal Wan2.1 1.3B checkpoint.

    The result is saved under OUTPUT_PATH using the model's pretrained
    sampling parameters.
    """
    # FastVideo picks default arguments for the model; for a local path it
    # makes a best-effort attempt to identify them.
    model_id = "wlsaidhi/SFWan2.1-T2V-1.3B-Diffusers"
    load_options = dict(
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        text_encoder_cpu_offload=False,
        dit_cpu_offload=False,
    )
    generator = VideoGenerator.from_pretrained(model_id, **load_options)
    sampling_param = SamplingParam.from_pretrained(model_id)

    text = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
    )
    generator.generate_video(
        text,
        output_path=OUTPUT_PATH,
        save_video=True,
        sampling_param=sampling_param,
    )
if __name__ == "__main__":
main()
basic_self_forcing_causal_wan2_2_i2v.py
# NOTE: This is still a work in progress, and the checkpoints are not released yet.
from fastvideo import VideoGenerator, SamplingParam
import json
# from fastvideo.api.sampling_param import SamplingParam
OUTPUT_PATH = "video_samples_self_forcing_causal_wan2_2_14B_i2v"
def _load_prompt_image_pairs(path):
    """Load prompt/image records from a JSON document or a JSON Lines file.

    The file is first parsed as one JSON document (the original behaviour).
    If that fails, it is parsed as JSON Lines: one JSON value per non-empty
    line, which is what the ``.jsonl`` extension suggests. A single top-level
    object is wrapped in a list so callers can always iterate over records.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    try:
        records = json.loads(text)
    except json.JSONDecodeError:
        records = [json.loads(line) for line in text.splitlines() if line.strip()]
    if isinstance(records, dict):
        records = [records]
    return records


def main():
    """Run self-forcing causal Wan2.2 I2V over every prompt/image pair.

    Pairs are read from ``assets/prompts/mixkit_i2v.jsonl``; each record must
    provide ``"prompt"`` and ``"image_path"``. One video per pair is saved
    under OUTPUT_PATH.
    """
    # FastVideo will automatically use the optimal default arguments for the
    # model.
    # If a local path is provided, FastVideo will make a best effort
    # attempt to identify the optimal arguments.
    generator = VideoGenerator.from_pretrained(
        "FastVideo/SFWan2.2-I2V-A14B-Preview-Diffusers",
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,  # DiT needs to be offloaded for MoE
        dit_precision="fp32",
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        dmd_denoising_steps=[1000, 850, 700, 550, 350, 275, 200, 125],
        # Set pin_cpu_memory to False if CPU RAM is limited and there are no
        # frequent CPU-GPU transfers.
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )
    sampling_param = SamplingParam.from_pretrained("FastVideo/SFWan2.2-I2V-A14B-Preview-Diffusers")
    sampling_param.num_frames = 81
    sampling_param.width = 832
    sampling_param.height = 480
    sampling_param.seed = 1000

    prompt_image_pairs = _load_prompt_image_pairs("assets/prompts/mixkit_i2v.jsonl")
    for prompt_image_pair in prompt_image_pairs:
        prompt = prompt_image_pair["prompt"]
        image_path = prompt_image_pair["image_path"]
        _ = generator.generate_video(
            prompt,
            image_path=image_path,
            output_path=OUTPUT_PATH,
            save_video=True,
            sampling_param=sampling_param,
        )
if __name__ == "__main__":
main()
basic_self_forcing_causal_wan2_2_t2v.py
# NOTE: This is still a work in progress, and the checkpoints are not released yet.
from fastvideo import VideoGenerator
# from fastvideo.api.sampling_param import SamplingParam
OUTPUT_PATH = "video_samples_self_forcing_causal_wan2_2_14B_t2v"
def main():
    """Generate one 81-frame video with the self-forcing causal Wan2.2 T2V checkpoint.

    The video is saved under OUTPUT_PATH.
    """
    # FastVideo picks default arguments for the model; for a local path it
    # makes a best-effort attempt to identify them.
    #
    # NOTE(review): the two init_weights_from_safetensors* values are
    # absolute paths on a specific shared filesystem; they presumably need
    # to be replaced to run this elsewhere -- confirm.
    load_options = dict(
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,  # DiT needs to be offloaded for MoE
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        dmd_denoising_steps=[1000, 850, 700, 550, 350, 275, 200, 125],
        # Set pin_cpu_memory to False if CPU RAM is limited and there are no
        # frequent CPU-GPU transfers.
        pin_cpu_memory=True,
        init_weights_from_safetensors="/mnt/sharefs/users/hao.zhang/wei/SFwan2.2_distill_self_forcing_release_cfg2/checkpoint-246_weight_only/generator_inference_transformer/",
        init_weights_from_safetensors_2="/mnt/sharefs/users/hao.zhang/wei/SFwan2.2_distill_self_forcing_release_cfg2/checkpoint-246_weight_only/generator_2_inference_transformer/",
        num_frame_per_block=7,
        # image_encoder_cpu_offload=False,
    )
    generator = VideoGenerator.from_pretrained("rand0nmr/SFWan2.2-T2V-A14B-Diffusers", **load_options)

    # Optional explicit sampling parameters (kept disabled, as in the original):
    # sampling_param = SamplingParam.from_pretrained("Wan-AI/Wan2.1-T2V-1.3B-Diffusers")
    # sampling_param.num_frames = 45
    # sampling_param.image_path = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"

    # The generation API is the same regardless of GPU count.
    text = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
    )
    generator.generate_video(text, output_path=OUTPUT_PATH, save_video=True, num_frames=81)
if __name__ == "__main__":
main()
basic_stable_audio.py
# SPDX-License-Identifier: Apache-2.0
"""Stable Audio Open 1.0 — text-to-audio (baseline) example.
User story (game-audio designer, prototyping):
"I'm prototyping a level and I need 6 seconds of background
ambience — gentle wind, distant thunder, a hint of birdsong. I
don't want to dig through a sound library; I want to type what I
hear in my head and get a wav back. If it's wrong I'll iterate
on the prompt. This is the first stop."
User story (musician sketching ideas):
"I want to bounce a 30s lo-fi drum loop to use as a placeholder
bed while I build the rest of the track. Type prompt, get audio,
drop into the DAW. The actual production beat I'll record
myself, but I need *something* to write the chords against."
User story (researcher exploring the model):
"First time touching Stable Audio Open — what does it sound
like at default settings? This is the smallest amount of code
that goes from prompt to mp4."
How it works:
Pure text-to-audio (T2A). The pipeline runs:
T5 + NumberConditioner -> StableAudioDiT -> Oobleck VAE
via the `dpmpp-3m-sde` k-diffusion sampler. All components are
FastVideo-native — no diffusers / transformers model imports at
runtime (see REVIEW item 30). Mirrors upstream
`stable_audio_tools.inference.generation.generate_diffusion_cond`
bit-for-bit (~0.2% abs_mean drift on 25 steps).
Tunable knobs (the "creative dials"):
audio_end_in_s
1–6 — quick ideation (sub-10s wall clock at 100 steps)
10–30 — full musical phrase / loop length (the README example
uses 30s)
47.5 — model maximum (full sample_size = 2097152 / 44100 Hz)
num_inference_steps
25 — fast preview, occasional artifacts
100 — preset default (matches the HF model card)
250 — diminishing returns past here
guidance_scale
3 — looser, more variation per seed
7 — preset default; matches README
12+ — sharper but can sound "fried"
Prerequisites:
1. Accept the terms on https://huggingface.co/stabilityai/stable-audio-open-1.0
and export your HF token in the shell:
export HF_TOKEN=hf_...
2. Install optional inference deps (one-time):
uv pip install k_diffusion einops_exts alias_free_torch torchsde
"""
from fastvideo import VideoGenerator
PROMPT = "Lo-fi hip hop instrumental with vinyl crackle and gentle piano."
def main() -> None:
    """Generate a 6-second clip from PROMPT with Stable Audio Open 1.0.

    The output path is ``outputs_audio/stable_audio_basic/output_stable_audio.wav``.
    The generator is shut down in a ``finally`` block so it is released even
    when generation raises.
    """
    generator = VideoGenerator.from_pretrained(
        "FastVideo/stable-audio-open-1.0-Diffusers",
        num_gpus=1,
    )
    output_path = "outputs_audio/stable_audio_basic/output_stable_audio.wav"
    try:
        generator.generate_video(
            prompt=PROMPT,
            output_path=output_path,
            save_video=True,
            # 6-second clip; the model max is ~47.5s.
            audio_end_in_s=6.0,
            # The registered preset gives 100 steps + CFG=7.0 by default;
            # override num_inference_steps / guidance_scale here for QA.
        )
    finally:
        generator.shutdown()
if __name__ == "__main__":
main()
basic_stable_audio_a2a.py
# SPDX-License-Identifier: Apache-2.0
"""Stable Audio Open 1.0 — audio-to-audio variation example.
User story (musician, late at night):
"I generated this 12-second lo-fi loop earlier and I love the chord
progression and overall vibe, but the snare hit at 0:08 sounds wrong
and the rhythm feels stiff. I don't want to start over from scratch
and lose what's working — I want the model to keep the harmony and
mood but reroll the percussion + groove."
User story (sound designer, on a deadline):
"I have one good 'sword clang' SFX. The art director wants 8 sibling
variations that all feel like the same sword from different angles —
same metal, same weight, slightly different impact. I'd rather
refine my one good take than text-prompt my way through 50 misses."
Pass `init_audio=path/to/clip` (any wav/mp3/mp4/m4a/flac the standard
deps decode) and the model will use it as a starting point for the
text prompt instead of pure noise.
Picking `init_audio_strength` (0.0 to 1.0):
Higher = closer to the source clip. Lower = more transformation.
(Same convention as the "Input Audio Strength" slider in
Stability's commercial Stable Audio web UI, so values transfer
directly.)
| strength | what you get |
|----------|----------------------------------------------------|
| 1.00 | Output ≈ reference. No transformation. |
| 0.85 | Texture micro-variation only. |
| 0.70 | Light reroll, same instruments. |
| 0.60 | Default. Instrument identity is replaceable |
| | (cello can take over from piano on the same notes).|
| 0.50 | Heavy — only melody / chord progression survives. |
| 0.30 | Reference acts as a loose mood prompt. |
| 0.00 | Plain T2A — reference ignored. |
Rule of thumb by intent:
* "Fix one part of this clip" -> 0.75 .. 0.85
* "Same notes, different instrument" -> 0.55 .. 0.65
* "Same chord progression, new content" -> 0.40 .. 0.55
* "Use this as a loose mood prompt" -> 0.20 .. 0.35
If the reference timbre is bleeding through more than you want,
lower it; if the structure is gone, raise it.
Prerequisites: same as `basic_stable_audio.py`.
"""
from fastvideo import VideoGenerator
PROMPT = "Change the piano to a cello playing the same notes"
# Path to any audio-bearing file (wav, mp3, mp4, m4a, flac, ...).
# Set to `None` to skip A2A and run plain T2A.
INIT_AUDIO_PATH: str | None = None
# Reference fidelity in [0, 1] -- higher = closer to source.
INIT_AUDIO_STRENGTH = 0.6
def main() -> None:
    """Generate a 6-second audio-to-audio variation with Stable Audio Open 1.0.

    INIT_AUDIO_PATH and INIT_AUDIO_STRENGTH are passed through as
    ``init_audio`` / ``init_audio_strength``. The output path is
    ``outputs_audio/stable_audio_a2a/output_a2a.wav``. The generator is shut
    down in a ``finally`` block so it is released even when generation raises.
    """
    generator = VideoGenerator.from_pretrained(
        "FastVideo/stable-audio-open-1.0-Diffusers",
        num_gpus=1,
    )
    try:
        generator.generate_video(
            prompt=PROMPT,
            output_path="outputs_audio/stable_audio_a2a/output_a2a.wav",
            save_video=True,
            audio_end_in_s=6.0,
            init_audio=INIT_AUDIO_PATH,
            init_audio_strength=INIT_AUDIO_STRENGTH,
        )
    finally:
        generator.shutdown()
if __name__ == "__main__":
main()
basic_stable_audio_inpaint.py
# SPDX-License-Identifier: Apache-2.0
"""Stable Audio Open 1.0 — inpainting / outpainting (loop extension) example.
User story (loop extension — the killer app):
"I have a 6-second drum loop my client likes. They want it as
background bed for a 30-second ad. I need it to loop seamlessly,
but a hard cut every 6s sounds bad. Let me extend it to 30s,
keeping the first 6s exactly as-is and letting the model continue
the groove for the remaining 24s."
User story (audio repair):
"There's a microphone bump at 0:14 in this 30-second field
recording — really obvious in headphones. Mask out 0:13 to 0:15
and let the model regenerate plausible ambience that blends in.
Everything else stays exactly as I recorded it."
User story (transition smoothing):
"I have two 10-second clips I want to crossfade. Mask out a 1s
overlap region in the middle and let the model invent a coherent
transition between the two."
How it works (RePaint-style blending):
Stable Audio Open 1.0 wasn't trained as an inpainting model
(`model_type=diffusion_cond`, not `diffusion_cond_inpaint`), so we
can't use the upstream's mask-conditioned approach directly. We
use the RePaint trick instead, which works on any v-prediction
diffusion model:
1. Encode the reference clip into latent space.
2. At every denoising step `i`, replace the kept region of the
in-flight latent (where mask == 1) with the reference
re-noised to the next timestep's sigma. Only the unkept
region (mask == 0) is freely denoised.
3. After the loop, the kept region is exactly the reference;
the unkept region is freshly generated content.
This is approximate compared to a properly trained inpainting
checkpoint — the seam between kept/unkept can have slight EQ
discontinuity — but it works on the existing public model.
Tunable: the mask is a 1-D tensor in {0, 1} at the model's sample
rate. Conventions:
1.0 = keep this sample from the reference
0.0 = regenerate this sample
Prerequisites: same as `basic_stable_audio.py`.
"""
import os
from fastvideo import VideoGenerator
PROMPT = "Steady lo-fi hip hop drum loop with vinyl crackle."
# Required: path to the reference audio file (wav, mp3, mp4, m4a, flac,
# ...) you want to extend or repair. The pipeline raises if a mask is
# passed without a reference, so this must be a real path.
REFERENCE_AUDIO_PATH = "path/to/your/loop.wav"
KEEP_SECONDS = 6.0 # first KEEP_SECONDS preserved exactly
TOTAL_SECONDS = 12.0 # extend the loop to this duration
def main() -> None:
    """Extend or repair REFERENCE_AUDIO_PATH with Stable Audio Open 1.0.

    Requests TOTAL_SECONDS of audio with the reference passed as
    ``inpaint_audio`` and ``(KEEP_SECONDS, TOTAL_SECONDS)`` as
    ``inpaint_mask``. The output path is
    ``outputs_audio/stable_audio_inpaint/output_inpaint.wav``. The generator
    is shut down in a ``finally`` block so it is released even when
    generation raises.

    Raises:
        FileNotFoundError: if REFERENCE_AUDIO_PATH is not an existing file.
    """
    # Fail fast, before loading the model, if the placeholder path was not edited.
    if not os.path.isfile(REFERENCE_AUDIO_PATH):
        raise FileNotFoundError(
            f"REFERENCE_AUDIO_PATH={REFERENCE_AUDIO_PATH!r} does not exist. "
            "Edit this script to point at a real audio file (wav/mp3/mp4/"
            "m4a/flac) before running.")
    generator = VideoGenerator.from_pretrained(
        "FastVideo/stable-audio-open-1.0-Diffusers",
        num_gpus=1,
    )
    try:
        generator.generate_video(
            prompt=PROMPT,
            output_path="outputs_audio/stable_audio_inpaint/output_inpaint.wav",
            save_video=True,
            audio_end_in_s=TOTAL_SECONDS,
            inpaint_audio=REFERENCE_AUDIO_PATH,
            # Tuple form: keep first KEEP_SECONDS, regenerate the rest.
            inpaint_mask=(KEEP_SECONDS, TOTAL_SECONDS),
        )
    finally:
        generator.shutdown()
if __name__ == "__main__":
main()
basic_stable_audio_small.py
# SPDX-License-Identifier: Apache-2.0
"""Stable Audio Open Small — fast / lightweight T2A example.
User story (interactive UI builder):
"I'm building a sound-design UI where the user types a prompt and
we want sub-2-second feedback so the experience feels like
autocomplete, not a render queue. The full Stable Audio Open 1.0
takes ~8s on a single GPU; the small variant takes a fraction of
that — quality is lower but completely usable for real-time
iteration."
User story (overnight batch jobs):
"I'm generating 10,000 short SFX variants for a procedural game.
Wall-clock matters more than per-clip polish — give me the small
model so I can fit the run in one night instead of a week."
How it works:
The small variant is a separate Stability AI checkpoint
(`stabilityai/stable-audio-open-small`) that ships the same Oobleck
VAE as the 1.0 base model but a smaller / faster DiT (`embed_dim=1024`,
`depth=16`, `qk_norm="ln"`) and only one duration conditioner
(`seconds_total`, no `seconds_start`). FastVideo loads from the
converted Diffusers-format repo `FastVideo/stable-audio-open-small-Diffusers`
via the standard component loader; per-variant arch fields come
from `transformer/config.json` and `conditioner/config.json`.
Prerequisites: same as `basic_stable_audio.py`. The converted repo is
public so no gated-access flow is required.
"""
from fastvideo import VideoGenerator
PROMPT = "Lo-fi hip hop instrumental with vinyl crackle and gentle piano."
def main() -> None:
    """Generate a 6-second clip from PROMPT with Stable Audio Open Small.

    The output path is
    ``outputs_audio/stable_audio_small/output_stable_audio_small.wav``. The
    generator is shut down in a ``finally`` block so it is released even when
    generation raises.
    """
    generator = VideoGenerator.from_pretrained(
        "FastVideo/stable-audio-open-small-Diffusers",
        num_gpus=1,
    )
    output_path = "outputs_audio/stable_audio_small/output_stable_audio_small.wav"
    try:
        generator.generate_video(
            prompt=PROMPT,
            output_path=output_path,
            save_video=True,
            # Small variant trains on a ~11.9s window — keep `audio_end_in_s`
            # at or below that.
            audio_end_in_s=6.0,
        )
    finally:
        generator.shutdown()
if __name__ == "__main__":
main()
basic_turbodiffusion.py
import os
# Set SLA attention backend BEFORE fastvideo imports
os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "SLA_ATTN"
from fastvideo import VideoGenerator
OUTPUT_PATH = "video_samples_turbodiffusion"
def main() -> None:
    """Generate two videos with the TurboDiffusion Wan2.1 T2V 1.3B checkpoint.

    Both use seed 42 and are saved under OUTPUT_PATH; the model is loaded
    only once.
    """
    # TurboDiffusion: 1-4 step video generation using RCM scheduler + SLA attention
    # FastVideo will automatically use TurboDiffusionPipeline when specified
    generator = VideoGenerator.from_pretrained(
        "loayrashid/TurboWan2.1-T2V-1.3B-Diffusers",
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        # set to false if using RTX 4090
        # pin_cpu_memory=False,
    )

    # The generation API is the same regardless of GPU count.
    # TurboDiffusion defaults: guidance_scale=1.0 and num_inference_steps=4 (from config)
    prompts = (
        ("A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
         "wide with interest. The playful yet serene atmosphere is complemented by soft "
         "natural light filtering through the petals. Mid-shot, warm and cheerful tones."),
        ("A majestic lion strides across the golden savanna, its powerful frame "
         "glistening under the warm afternoon sun. The tall grass ripples gently in "
         "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
         "embodying the raw energy of the wild. Low angle, steady tracking shot, "
         "cinematic."),
    )
    for text in prompts:
        generator.generate_video(
            text,
            output_path=OUTPUT_PATH,
            save_video=True,
            seed=42,
        )
if __name__ == "__main__":
main()
basic_turbodiffusion_14b.py
import os
# Set SLA attention backend BEFORE fastvideo imports
os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "SLA_ATTN"
from fastvideo import VideoGenerator
OUTPUT_PATH = "video_samples_turbodiffusion_14B"
def main() -> None:
    """Generate two videos with the TurboDiffusion Wan2.1 T2V 14B checkpoint on 2 GPUs.

    Both use seed 42 and are saved under OUTPUT_PATH; the model is loaded
    only once.
    """
    # TurboDiffusion 14B: 1-4 step video generation using RCM scheduler + SLA attention
    # FastVideo will automatically use TurboDiffusionPipeline when specified
    generator = VideoGenerator.from_pretrained(
        "loayrashid/TurboWan2.1-T2V-14B-Diffusers",
        # 14B model needs more GPUs
        num_gpus=2,
    )

    prompts = (
        ("A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
         "wide with interest. The playful yet serene atmosphere is complemented by soft "
         "natural light filtering through the petals. Mid-shot, warm and cheerful tones."),
        ("A majestic lion strides across the golden savanna, its powerful frame "
         "glistening under the warm afternoon sun. The tall grass ripples gently in "
         "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
         "embodying the raw energy of the wild. Low angle, steady tracking shot, "
         "cinematic."),
    )
    shared_options = dict(output_path=OUTPUT_PATH, save_video=True, seed=42)
    for text in prompts:
        generator.generate_video(text, **shared_options)
if __name__ == "__main__":
main()
basic_turbodiffusion_i2v.py
import os
# Set SLA attention backend BEFORE fastvideo imports
os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "SLA_ATTN"
from fastvideo import VideoGenerator
# Hugging Face model ID; replace with a local directory path to load local weights
MODEL_PATH = "loayrashid/TurboWan2.2-I2V-A14B-Diffusers"
OUTPUT_PATH = "video_samples_turbodiffusion_i2v"
def main() -> None:
    """Generate one image-to-video clip with the TurboDiffusion Wan2.2 I2V checkpoint.

    Runs on 2 GPUs with seed 42 and saves the result under OUTPUT_PATH.
    """
    # TurboDiffusion I2V: 1-4 step image-to-video generation
    generator = VideoGenerator.from_pretrained(MODEL_PATH, num_gpus=2)

    # Text prompt describing the scene.
    text = ("Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside.")
    # Conditioning image (a URL here).
    source_image = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/wan_i2v_input.JPG"

    request = dict(
        image_path=source_image,
        output_path=OUTPUT_PATH,
        save_video=True,
        seed=42,
    )
    generator.generate_video(text, **request)
if __name__ == "__main__":
main()
basic_wan2_2.py
from fastvideo import VideoGenerator
# from fastvideo.api.sampling_param import SamplingParam
OUTPUT_PATH = "video_samples_wan2_2_14B_t2v"
def main():
    """Generate two 1280x720, 81-frame videos with Wan2.2 T2V A14B on 2 GPUs.

    Both are saved under OUTPUT_PATH; the model is loaded only once.
    """
    # FastVideo picks default arguments for the model; for a local path it
    # makes a best-effort attempt to identify them.
    generator = VideoGenerator.from_pretrained(
        "Wan-AI/Wan2.2-T2V-A14B-Diffusers",
        # FastVideo will automatically handle distributed setup
        num_gpus=2,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,  # DiT needs to be offloaded for MoE
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        # Set pin_cpu_memory to False if CPU RAM is limited and there are no
        # frequent CPU-GPU transfers.
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )

    # Optional explicit sampling parameters (kept disabled, as in the original):
    # sampling_param = SamplingParam.from_pretrained("Wan-AI/Wan2.1-T2V-1.3B-Diffusers")
    # sampling_param.num_frames = 45
    # sampling_param.image_path = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
    # video = generator.generate_video(prompt, sampling_param=sampling_param, output_path="wan_t2v_videos/")

    # The generation API is the same regardless of GPU count, and further
    # prompts reuse the already-loaded model.
    prompts = (
        ("A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
         "wide with interest. The playful yet serene atmosphere is complemented by soft "
         "natural light filtering through the petals. Mid-shot, warm and cheerful tones."),
        ("A majestic lion strides across the golden savanna, its powerful frame "
         "glistening under the warm afternoon sun. The tall grass ripples gently in "
         "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
         "embodying the raw energy of the wild. Low angle, steady tracking shot, "
         "cinematic."),
    )
    for text in prompts:
        generator.generate_video(
            text,
            output_path=OUTPUT_PATH,
            save_video=True,
            height=720,
            width=1280,
            num_frames=81,
        )
if __name__ == "__main__":
main()
basic_wan2_2_Fun.py
from fastvideo import VideoGenerator
# from fastvideo.api.sampling_param import SamplingParam
OUTPUT_PATH = "video_samples_wan2_1_Fun"
OUTPUT_NAME = "wan2.1_test"
def main():
    """Generate one control-conditioned video with the Wan2.1-Fun 1.3B Control checkpoint.

    A reference image and a pose control video (both URLs) are passed along
    with a prompt and negative prompt. The result is saved under OUTPUT_PATH
    with the name OUTPUT_NAME.
    """
    # FastVideo picks default arguments for the model; for a local path it
    # makes a best-effort attempt to identify them.
    generator = VideoGenerator.from_pretrained(
        "IRMChen/Wan2.1-Fun-1.3B-Control-Diffusers",
        # "alibaba-pai/Wan2.2-Fun-A14B-Control",
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,  # DiT needs to be offloaded for MoE
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        # Set pin_cpu_memory to False if CPU RAM is limited and there are no
        # frequent CPU-GPU transfers.
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )

    text = "一位年轻女性穿着一件粉色的连衣裙,裙子上有白色的装饰和粉色的纽扣。她的头发是紫色的,头上戴着一个红色的大蝴蝶结,显得非常可爱和精致。她还戴着一个红色的领结,整体造型充满了少女感和活力。她的表情温柔,双手轻轻交叉放在身前,姿态优雅。背景是简单的灰色,没有任何多余的装饰,使得人物更加突出。她的妆容清淡自然,突显了她的清新气质。整体画面给人一种甜美、梦幻的感觉,仿佛置身于童话世界中。"
    avoid = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"
    # Alternative English prompt pair (kept disabled, as in the original):
    # prompt = "A young woman with beautiful, clear eyes and blonde hair stands in the forest, wearing a white dress and a crown. Her expression is serene, reminiscent of a movie star, with fair and youthful skin. Her brown long hair flows in the wind. The video quality is very high, with a clear view. High quality, masterpiece, best quality, high resolution, ultra-fine, fantastical."
    # negative_prompt = "Twisted body, limb deformities, text captions, comic, static, ugly, error, messy code."

    reference_image = "https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/wan_fun/asset_Wan2_2/v1.0/8.png"
    control_video = "https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/wan_fun/asset_Wan2_2/v1.0/pose.mp4"

    generator.generate_video(
        text,
        negative_prompt=avoid,
        image_path=reference_image,
        video_path=control_video,
        output_path=OUTPUT_PATH,
        output_video_name=OUTPUT_NAME,
        save_video=True,
    )
if __name__ == "__main__":
main()
basic_wan2_2_i2v.py
from fastvideo import VideoGenerator
# from fastvideo.api.sampling_param import SamplingParam
OUTPUT_PATH = "video_samples_wan2_2_14B_i2v"
def main():
    """Generate one 81-frame image-to-video clip with Wan2.2 I2V A14B.

    The conditioning image is fetched from a URL and the result is saved
    under OUTPUT_PATH.
    """
    # FastVideo picks default arguments for the model; for a local path it
    # makes a best-effort attempt to identify them.
    load_options = dict(
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,  # DiT needs to be offloaded for MoE
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        # Set pin_cpu_memory to False if CPU RAM is limited and there are no
        # frequent CPU-GPU transfers.
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )
    generator = VideoGenerator.from_pretrained("Wan-AI/Wan2.2-I2V-A14B-Diffusers", **load_options)

    text = "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside."
    source_image = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/wan_i2v_input.JPG"

    # NOTE(review): height=832 / width=480 requests a portrait frame; other
    # examples in this file use width=832, height=480 -- confirm these are
    # not swapped.
    generator.generate_video(
        text,
        image_path=source_image,
        output_path=OUTPUT_PATH,
        save_video=True,
        height=832,
        width=480,
        num_frames=81,
    )
if __name__ == "__main__":
main()
basic_wan2_2_ti2v.py
from fastvideo import VideoGenerator
OUTPUT_PATH = "video_samples_wan2_2_5B_ti2v"
def main():
    """Run Wan2.2 TI2V 5B in both modes with one loaded model.

    The first call passes an image (I2V); the second passes only a prompt
    (T2V). Both results are saved under OUTPUT_PATH.
    """
    # FastVideo picks default arguments for the model; for a local path it
    # makes a best-effort attempt to identify them.
    generator = VideoGenerator.from_pretrained(
        "Wan-AI/Wan2.2-TI2V-5B-Diffusers",
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        # set to false if low CPU RAM or hit obscure "CUDA error: Invalid argument"
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )
    save_options = dict(output_path=OUTPUT_PATH, save_video=True)

    # I2V is triggered just by passing in an image_path argument
    i2v_prompt = "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside."
    source_image = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/wan_i2v_input.JPG"
    generator.generate_video(i2v_prompt, image_path=source_image, **save_options)

    # T2V mode: a different prompt, without reloading the model.
    t2v_prompt = (
        "A majestic lion strides across the golden savanna, its powerful frame "
        "glistening under the warm afternoon sun. The tall grass ripples gently in "
        "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
        "embodying the raw energy of the wild. Low angle, steady tracking shot, "
        "cinematic.")
    generator.generate_video(t2v_prompt, **save_options)
if __name__ == "__main__":
main()