Source: examples/inference/basic

Basic Video Generation Tutorial¶

The VideoGenerator class provides the primary Python interface for offline video generation, that is, for interacting with a diffusion pipeline directly, without running a separate inference API server.

Requirements¶

At least a single NVIDIA GPU with CUDA 12.4.
Python 3.10-3.12

Installation¶

If you have not installed FastVideo, please follow these instructions first.

Usage¶

The first script in this example shows the most basic usage of FastVideo. If you are new to Python and FastVideo, you should start here.

# if you have not cloned the directory:
git clone https://github.com/hao-ai-lab/FastVideo.git && cd FastVideo

python examples/inference/basic/basic.py

For an example on Apple silicon:

python examples/inference/basic/basic_mps.py

For an example running DMD+VSA inference:

python examples/inference/basic/basic_dmd.py

For the typed config/request path added during the inference API refactor:

python examples/inference/basic/basic_dmd_new_api.py

Basic Walkthrough¶

All you need to generate videos on one or more GPUs with state-of-the-art diffusion pipelines is the following few lines!

from fastvideo import VideoGenerator

def main():
    """Create a generator for Wan2.1-T2V-1.3B and call it with a single prompt."""
    model_id = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
    prompt = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
    )

    generator = VideoGenerator.from_pretrained(model_id, num_gpus=1)
    video = generator.generate_video(prompt)

if __name__ == "__main__":
    main()

Additional Files¶

basic.py

from fastvideo import VideoGenerator

# from fastvideo.api.sampling_param import SamplingParam

OUTPUT_PATH = "video_samples"
def main():
    """Generate two videos with Wan2.1-T2V-1.3B, reusing one loaded generator.

    Both calls pass ``output_path=OUTPUT_PATH`` and ``save_video=True``. The
    generator is shut down when the function exits, including when a
    generation call raises.
    """
    # FastVideo will automatically use the optimal default arguments for the
    # model.
    # If a local path is provided, FastVideo will make a best effort
    # attempt to identify the optimal arguments.
    generator = VideoGenerator.from_pretrained(
        "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,  # set to false if low CPU RAM or hit obscure "CUDA error: Invalid argument"
        # image_encoder_cpu_offload=False,
    )

    # Optional: customise sampling by loading the model defaults and editing them.
    # sampling_param = SamplingParam.from_pretrained("Wan-AI/Wan2.1-T2V-1.3B-Diffusers")
    # sampling_param.num_frames = 45
    # sampling_param.image_path = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
    # Generate videos with the same simple API, regardless of GPU count
    prompt = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
    )
    # Generate another video with a different prompt, without reloading the
    # model!
    prompt2 = (
        "A majestic lion strides across the golden savanna, its powerful frame "
        "glistening under the warm afternoon sun. The tall grass ripples gently in "
        "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
        "embodying the raw energy of the wild. Low angle, steady tracking shot, "
        "cinematic."
    )

    try:
        generator.generate_video(prompt, output_path=OUTPUT_PATH, save_video=True)
        # generator.generate_video(prompt, sampling_param=sampling_param, output_path="wan_t2v_videos/")
        generator.generate_video(prompt2, output_path=OUTPUT_PATH, save_video=True)
    finally:
        # Release the generator even if a generation call fails, matching the
        # other examples that call shutdown() explicitly.
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_cosmos2_5_i2w.py

# SPDX-License-Identifier: Apache-2.0
from fastvideo import VideoGenerator
from fastvideo.api.sampling_param import SamplingParam


def main():
    """Generate one Cosmos-Predict2.5 image-to-world video from a conditioning image.

    Sampling defaults are loaded for the model and passed through unchanged.
    The generator is shut down whether generation succeeds or raises.
    """
    # Model location: a HF model ID, or replace with a local diffusers model dir.
    model_path = "KyleShao/Cosmos-Predict2.5-2B-Diffusers"

    generator = VideoGenerator.from_pretrained(
        model_path,
        num_gpus=1,
        use_fsdp_inference=False,  # set True if GPU is out of memory
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,
    )

    try:
        sampling_param = SamplingParam.from_pretrained(model_path)

        # image2world example from official repo
        image_path = "assets/images/bus_terminal.jpg"

        prompt = (
            "A nighttime city bus terminal gradually shifts from stillness to subtle movement. "
            "At first, multiple double-decker buses are parked under the glow of overhead lights, "
            "with a central bus labeled '87D' facing forward and stationary. "
            "As the video progresses, the bus in the middle moves ahead slowly, its headlights brightening the surrounding area "
            "and casting reflections onto adjacent vehicles. "
            "The motion creates space in the lineup, signaling activity within the otherwise quiet station. "
            "It then comes to a smooth stop, resuming its position in line. "
            "Overhead signage in Chinese characters remains illuminated, enhancing the vibrant, urban night scene."
        )

        generator.generate_video(
            prompt,
            sampling_param=sampling_param,
            image_path=str(image_path),
            num_cond_frames=1,
            output_path="outputs_video/cosmos2_5_i2w.mp4",
            save_video=True,
        )
    finally:
        # Previously shutdown() was skipped whenever generation raised.
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_cosmos2_5_t2w.py

from fastvideo import VideoGenerator
from fastvideo.api.sampling_param import SamplingParam


def main():
    """Generate one Cosmos-Predict2.5 video from a text prompt only.

    Sampling defaults are loaded for the model and passed through unchanged.
    The generator is shut down whether generation succeeds or raises.
    """
    # Model location: a HF model ID, or replace with a local diffusers model dir.
    model_path = "KyleShao/Cosmos-Predict2.5-2B-Diffusers"

    generator = VideoGenerator.from_pretrained(
        model_path,
        num_gpus=1,
        use_fsdp_inference=False,  # set True if GPU is out of memory
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,
    )

    try:
        # Load default sampling parameters (negative_prompt, resolution, steps, etc.)
        sampling_param = SamplingParam.from_pretrained(model_path)

        prompt = (
            "A high-definition video captures the precision of robotic welding in an industrial setting. "
            "The first frame showcases a robotic arm, equipped with a welding torch, positioned over a large metal structure. "
            "The welding process is in full swing, with bright sparks and intense light illuminating the scene, "
            "creating a vivid display of blue and white hues. "
            "A significant amount of smoke billows around the welding area, partially obscuring the view but emphasizing the heat and activity. "
            "The background reveals parts of the workshop environment, including a ventilation system and various pieces of machinery, "
            "indicating a busy and functional industrial workspace. "
            "As the video progresses, the robotic arm maintains its steady position, continuing the welding process and moving to its left. "
            "The welding torch consistently emits sparks and light, and the smoke continues to rise, diffusing slightly as it moves upward. "
            "The metal surface beneath the torch shows ongoing signs of heating and melting. "
            "The scene retains its industrial ambiance, with the welding sparks and smoke dominating the visual field, "
            "underscoring the ongoing nature of the welding operation."
        )

        generator.generate_video(
            prompt,
            sampling_param=sampling_param,
            output_path="outputs_video/cosmos2_5_t2w.mp4",
            save_video=True,
        )
    finally:
        # Previously shutdown() was skipped whenever generation raised.
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_cosmos2_5_v2w.py

# SPDX-License-Identifier: Apache-2.0
from fastvideo import VideoGenerator
from fastvideo.api.sampling_param import SamplingParam


def main():
    """Generate one Cosmos-Predict2.5 video-to-world clip from a conditioning video.

    Sampling defaults are loaded for the model and passed through unchanged.
    The generator is shut down whether generation succeeds or raises.
    """
    # Model location: a HF model ID, or replace with a local diffusers model dir.
    model_path = "KyleShao/Cosmos-Predict2.5-2B-Diffusers"

    generator = VideoGenerator.from_pretrained(
        model_path,
        num_gpus=1,
        use_fsdp_inference=False,  # set True if GPU is out of memory
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,
    )

    try:
        sampling_param = SamplingParam.from_pretrained(model_path)

        # video2world example from official repo
        video_path = "assets/videos/robot_pouring.mp4"

        prompt = (
            "A robotic arm, primarily white with black joints and cables, is shown in a clean, modern indoor setting with a white tabletop. "
            "The arm, equipped with a gripper holding a small, light green pitcher, is positioned above a clear glass containing a reddish-brown liquid and a spoon. "
            "The robotic arm is in the process of pouring a transparent liquid into the glass. "
            "To the left of the pitcher, there is an opened jar with a similar reddish-brown substance visible through its transparent body. "
            "In the background, a vase with white flowers and a brown couch are partially visible, adding to the contemporary ambiance. "
            "The lighting is bright, casting soft shadows on the table. "
            "The robotic arm's movements are smooth and controlled, demonstrating precision in its task. "
            "As the video progresses, the robotic arm completes the pour, leaving the glass half-filled with the reddish-brown liquid. "
            "The jar remains untouched throughout the sequence, and the spoon inside the glass remains stationary. "
            "The other robotic arm on the right side also stays stationary throughout the video. "
            "The final frame captures the robotic arm with the pitcher finishing the pour, with the glass now filled to a higher level, while the pitcher is slightly tilted but still held securely by the gripper."
        )

        generator.generate_video(
            prompt,
            sampling_param=sampling_param,
            video_path=str(video_path),
            num_cond_frames=1,
            output_path="outputs_video/cosmos2_5_v2w.mp4",
            save_video=True,
        )
    finally:
        # Previously shutdown() was skipped whenever generation raised.
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_dmd.py

import os
import time
from fastvideo import VideoGenerator

from fastvideo.api.sampling_param import SamplingParam

OUTPUT_PATH = "video_samples_dmd2"
def main():
    """Time model loading and two generations with the FastWan2.1 DMD model.

    Sets ``FASTVIDEO_ATTENTION_BACKEND`` to ``VIDEO_SPARSE_ATTN`` before the
    generator is created, saves both videos under ``OUTPUT_PATH``, prints the
    three timings, and shuts the generator down even if generation raises.
    """
    os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "VIDEO_SPARSE_ATTN"

    load_start_time = time.perf_counter()
    model_name = "FastVideo/FastWan2.1-T2V-1.3B-Diffusers"
    generator = VideoGenerator.from_pretrained(
        model_name,
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        # Adjust these offload parameters if you have < 32GB of VRAM
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,  # set to false if low CPU RAM or hit obscure "CUDA error: Invalid argument"
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        VSA_sparsity=0.8,
    )
    load_time = time.perf_counter() - load_start_time

    try:
        sampling_param = SamplingParam.from_pretrained(model_name)
        sampling_param.num_frames = 81

        prompt = (
            "A neon-lit alley in futuristic Tokyo during a heavy rainstorm at night. The puddles reflect glowing signs in kanji, advertising ramen, karaoke, and VR arcades. A woman in a translucent raincoat walks briskly with an LED umbrella. Steam rises from a street food cart, and a cat darts across the screen. Raindrops are visible on the camera lens, creating a cinematic bokeh effect."
        )
        start_time = time.perf_counter()
        generator.generate_video(prompt, output_path=OUTPUT_PATH, save_video=True, sampling_param=sampling_param)
        gen_time = time.perf_counter() - start_time

        # Generate another video with a different prompt, without reloading the
        # model!
        prompt2 = (
            "A majestic lion strides across the golden savanna, its powerful frame "
            "glistening under the warm afternoon sun. The tall grass ripples gently in "
            "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
            "embodying the raw energy of the wild. Low angle, steady tracking shot, "
            "cinematic."
        )
        start_time = time.perf_counter()
        # This call sets the frame count via a keyword instead of a SamplingParam.
        generator.generate_video(prompt2, output_path=OUTPUT_PATH, save_video=True, num_frames=81)
        gen_time2 = time.perf_counter() - start_time

        print(f"Time taken to load model: {load_time} seconds")
        print(f"Time taken to generate video: {gen_time} seconds")
        print(f"Time taken to generate video2: {gen_time2} seconds")
    finally:
        # Release the generator even if a generation call fails, matching the
        # other examples that call shutdown() explicitly.
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_dmd_new_api.py

import os
import time

from fastvideo import VideoGenerator
from fastvideo.api import (
    EngineConfig,
    GenerationRequest,
    GeneratorConfig,
    OffloadConfig,
    OutputConfig,
    PipelineSelection,
)

OUTPUT_PATH = "video_samples_dmd2_typed"


def main():
    """Time model loading and two generations using the typed config/request API.

    Builds a ``GeneratorConfig``, creates the generator with ``from_config``,
    runs one ``GenerationRequest`` per prompt, prints timings and each result's
    ``video_path``, and shuts the generator down even if generation raises.
    """
    os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "VIDEO_SPARSE_ATTN"

    model_name = "FastVideo/FastWan2.1-T2V-1.3B-Diffusers"
    generator_config = GeneratorConfig(
        model_path=model_name,
        engine=EngineConfig(
            num_gpus=1,
            use_fsdp_inference=False,
            offload=OffloadConfig(
                text_encoder=True,
                pin_cpu_memory=True,
                dit=False,
                vae=False,
            ),
        ),
        # A few advanced inference knobs are still routed through the
        # compatibility bridge until they get first-class typed fields.
        pipeline=PipelineSelection(
            experimental={
                "VSA_sparsity": 0.8,
            },
        ),
    )

    load_start_time = time.perf_counter()
    generator = VideoGenerator.from_config(generator_config)
    load_time = time.perf_counter() - load_start_time

    prompt = (
        "A neon-lit alley in futuristic Tokyo during a heavy rainstorm at night. "
        "The puddles reflect glowing signs in kanji, advertising ramen, karaoke, "
        "and VR arcades. A woman in a translucent raincoat walks briskly with an "
        "LED umbrella. Steam rises from a street food cart, and a cat darts "
        "across the screen. Raindrops are visible on the camera lens, creating "
        "a cinematic bokeh effect."
    )
    prompt2 = (
        "A majestic lion strides across the golden savanna, its powerful frame "
        "glistening under the warm afternoon sun. The tall grass ripples gently "
        "in the breeze, enhancing the lion's commanding presence. The tone is "
        "vibrant, embodying the raw energy of the wild. Low angle, steady "
        "tracking shot, cinematic."
    )

    try:
        # Each prompt gets its own request and OutputConfig instance; only the
        # generate() call itself is timed.
        timed_results = []
        for text in (prompt, prompt2):
            request = GenerationRequest(
                prompt=text,
                output=OutputConfig(
                    output_path=OUTPUT_PATH,
                    save_video=True,
                    return_frames=False,
                ),
            )
            start_time = time.perf_counter()
            result = generator.generate(request)
            timed_results.append((time.perf_counter() - start_time, result))

        (gen_time, result), (gen_time2, result2) = timed_results
        print(f"Time taken to load model: {load_time} seconds")
        print(f"Time taken to generate video: {gen_time} seconds")
        print(f"First output written to: {result.video_path}")
        print(f"Time taken to generate video2: {gen_time2} seconds")
        print(f"Second output written to: {result2.video_path}")
    finally:
        # Release the generator even if a generation call fails, matching the
        # other examples that call shutdown() explicitly.
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_dreamx_world.py

import os

from fastvideo import VideoGenerator


OUTPUT_PATH = os.getenv("DREAMX_WORLD_OUTPUT_PATH", "video_samples_dreamx_world")


def _env_int(name: str, default: int) -> int:
    """Return environment variable ``name`` parsed as an integer.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset or blank.

    Raises:
        ValueError: If the variable holds text that is not a valid integer.
            The message names the variable so the bad setting is easy to find.
    """
    raw = os.getenv(name, "").strip()
    if not raw:
        # Treat `NAME=` (set but empty) like an unset variable.
        return int(default)
    try:
        return int(raw)
    except ValueError as err:
        raise ValueError(
            f"Environment variable {name} must be an integer, got {raw!r}"
        ) from err


def _env_float(name: str, default: float) -> float:
    """Return environment variable ``name`` parsed as a float.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset or blank.

    Raises:
        ValueError: If the variable holds text that is not a valid float.
            The message names the variable so the bad setting is easy to find.
    """
    raw = os.getenv(name, "").strip()
    if not raw:
        # Treat `NAME=` (set but empty) like an unset variable.
        return float(default)
    try:
        return float(raw)
    except ValueError as err:
        raise ValueError(
            f"Environment variable {name} must be a number, got {raw!r}"
        ) from err


def main():
    """Run one DreamX-World generation configured by ``DREAMX_WORLD_*`` variables.

    Every setting (model, prompt, image, resolution, frame count, steps,
    guidance, action list and speeds) is read from the environment with the
    defaults shown below. The generator is always shut down after the
    generation attempt.
    """
    # Resolve every environment-driven setting before loading the model, so a
    # malformed value fails fast instead of after the model load, with a live
    # generator that nothing would shut down.
    model_name = os.getenv("DREAMX_WORLD_MODEL_DIR", "FastVideo/DreamX-World-5B-Cam-Diffusers")
    prompt = os.getenv(
        "DREAMX_WORLD_PROMPT",
        "A cinematic first-person drive through a futuristic coastal city at "
        "sunrise, reflective glass towers, clean streets, soft volumetric light.",
    )
    image_path = os.getenv(
        "DREAMX_WORLD_IMAGE_PATH",
        "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/wan_i2v_input.JPG",
    )

    kwargs = {
        "output_path": OUTPUT_PATH,
        "save_video": os.getenv("DREAMX_WORLD_SAVE_VIDEO", "1") != "0",
        "height": _env_int("DREAMX_WORLD_HEIGHT", 480),
        "width": _env_int("DREAMX_WORLD_WIDTH", 832),
        "num_frames": _env_int("DREAMX_WORLD_NUM_FRAMES", 161),
        "num_inference_steps": _env_int("DREAMX_WORLD_STEPS", 30),
        "guidance_scale": _env_float("DREAMX_WORLD_GUIDANCE", 5.0),
        # Strip each entry so "w, d, w" is accepted the same as "w,d,w".
        "action_list": [
            action.strip()
            for action in os.getenv("DREAMX_WORLD_ACTIONS", "w,d,w").split(",")
        ],
        "action_speed_list": [
            float(value)
            for value in os.getenv("DREAMX_WORLD_ACTION_SPEEDS", "4.0,2.0,4.0").split(",")
        ],
    }
    # An empty DREAMX_WORLD_IMAGE_PATH omits the image argument entirely.
    if image_path:
        kwargs["image_path"] = image_path

    generator = VideoGenerator.from_pretrained(
        model_name,
        num_gpus=1,
        use_fsdp_inference=False,
        dit_cpu_offload=False,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=False,
        override_pipeline_cls_name="DreamXWorldPipeline",
    )

    try:
        generator.generate_video(prompt, **kwargs)
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_flux2.py

# SPDX-License-Identifier: Apache-2.0
"""Run full Flux2 text-to-image generation through FastVideo.

User story:
    "I have a local or HF Diffusers-format full Flux2 checkpoint and want a
    minimal text-to-image generation command that uses embedded guidance."
"""
import argparse
import os
from pathlib import Path

from fastvideo import VideoGenerator
from fastvideo.api import (
    ComponentConfig,
    EngineConfig,
    GenerationRequest,
    GeneratorConfig,
    OffloadConfig,
    OutputConfig,
    ParallelismConfig,
    PipelineSelection,
    SamplingConfig,
)


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the full Flux2 text-to-image example."""
    parser = argparse.ArgumentParser(description="Run full Flux2 text-to-image generation.")

    # (flag, add_argument keyword options), registered in --help order.
    option_table = [
        (
            "--model-path",
            {
                "default": "black-forest-labs/FLUX.2-dev",
                "help": "HF id or local diffusers-format full Flux2 weights directory.",
            },
        ),
        ("--output", {"default": "outputs/flux2/flux2.png", "help": "Output PNG path."}),
        (
            "--prompt",
            {
                "default": "a photo of a banana on a wooden table, studio lighting",
                "help": "Text prompt.",
            },
        ),
        ("--height", {"type": int, "default": 1024}),
        ("--width", {"type": int, "default": 1024}),
        ("--steps", {"type": int, "default": 50}),
        ("--guidance-scale", {"type": float, "default": 4.0}),
        ("--max-sequence-length", {"type": int, "default": None}),
        ("--seed", {"type": int, "default": 0}),
        ("--num-gpus", {"type": int, "default": 1}),
        ("--tp-size", {"type": int, "default": None}),
        ("--sp-size", {"type": int, "default": None}),
        (
            "--backend",
            {
                "default": None,
                "help": "Set FASTVIDEO_ATTENTION_BACKEND, for example TORCH_SDPA.",
            },
        ),
    ]
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    return parser.parse_args()


def main() -> None:
    """Parse CLI options, build the Flux2 generator, and run one generation request.

    The output's parent directory is created up front. The generator is shut
    down in a ``finally`` clause, so it is released even when building the
    request or generating raises.
    """
    args = parse_args()
    if args.backend:
        os.environ["FASTVIDEO_ATTENTION_BACKEND"] = args.backend

    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)

    # Unless overridden on the command line, a multi-GPU run assigns every GPU
    # to tensor parallelism and keeps sequence parallelism at 1.
    multi_gpu = args.num_gpus > 1
    tp_size = args.tp_size
    if tp_size is None:
        tp_size = args.num_gpus if multi_gpu else 1
    sp_size = args.sp_size
    if sp_size is None:
        sp_size = 1 if multi_gpu else args.num_gpus

    parallelism = ParallelismConfig(tp_size=tp_size, sp_size=sp_size)
    offload = OffloadConfig(
        dit=False,
        vae=True,
        text_encoder=True,
        pin_cpu_memory=False,
    )
    engine = EngineConfig(
        num_gpus=args.num_gpus,
        parallelism=parallelism,
        use_fsdp_inference=False,
        offload=offload,
    )
    pipeline = PipelineSelection(
        workload_type="t2i",
        components=ComponentConfig(override_pipeline_cls_name="Flux2Pipeline"),
    )
    generator_config = GeneratorConfig(
        model_path=args.model_path,
        engine=engine,
        pipeline=pipeline,
    )

    generator = VideoGenerator.from_config(generator_config)
    try:
        # A single frame at fps=1 is how this example requests one image.
        sampling = SamplingConfig(
            height=args.height,
            width=args.width,
            num_frames=1,
            fps=1,
            num_inference_steps=args.steps,
            guidance_scale=args.guidance_scale,
            seed=args.seed,
        )
        # Only forward max_sequence_length when the user supplied it.
        if args.max_sequence_length is None:
            extensions = {}
        else:
            extensions = {"max_sequence_length": args.max_sequence_length}

        output_config = OutputConfig(
            output_path=str(output),
            save_video=True,
            return_frames=False,
        )
        request = GenerationRequest(
            prompt=args.prompt,
            sampling=sampling,
            output=output_config,
            extensions=extensions,
        )
        generator.generate(request)
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_flux2_klein.py

# SPDX-License-Identifier: Apache-2.0
"""Run Flux2 Klein text-to-image generation through FastVideo.

User story:
    "I need a short local smoke for the Flux2 Klein checkpoint before wiring it
    into an image workflow. Use the model's distilled four-step defaults and
    write a single PNG so I can compare the output against the reference."
"""

import argparse
import os

from fastvideo import VideoGenerator
from fastvideo.api import (
    EngineConfig,
    GenerationRequest,
    GeneratorConfig,
    OffloadConfig,
    OutputConfig,
    PipelineSelection,
    SamplingConfig,
)


DEFAULT_PROMPT = "a brushed steel espresso machine on a marble counter, morning window light"


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the Flux2 Klein text-to-image example."""
    parser = argparse.ArgumentParser(description="Run Flux2 Klein text-to-image generation.")

    parser.add_argument(
        "--model-path",
        help="HF id or local diffusers-format Flux2 Klein weights directory.",
        default="black-forest-labs/FLUX.2-klein-4B",
    )
    parser.add_argument(
        "--output-path",
        help="PNG output path or output directory.",
        default="outputs/flux2/flux2_klein.png",
    )
    parser.add_argument("--prompt", help="Prompt text.", default=DEFAULT_PROMPT)

    # Integer options share the same shape: (flag, default, help text).
    integer_options = (
        ("--seed", 0, "Generation seed."),
        ("--height", 1024, "Output image height."),
        ("--width", 1024, "Output image width."),
        ("--steps", 4, "Number of denoising steps."),
        ("--num-gpus", 1, "Number of GPUs to use."),
    )
    for flag, default_value, help_text in integer_options:
        parser.add_argument(flag, type=int, default=default_value, help=help_text)

    parser.add_argument(
        "--backend",
        help="Set FASTVIDEO_ATTENTION_BACKEND, for example TORCH_SDPA.",
        default=None,
    )
    return parser.parse_args()


def main() -> None:
    """Parse CLI options, build the Flux2 Klein generator, and run one request.

    The generator is shut down in a ``finally`` clause, so it is released even
    when building the request or generating raises.
    """
    args = parse_args()
    if args.backend:
        os.environ["FASTVIDEO_ATTENTION_BACKEND"] = args.backend

    offload = OffloadConfig(
        dit=False,
        vae=True,
        text_encoder=True,
        pin_cpu_memory=False,
    )
    engine = EngineConfig(
        num_gpus=args.num_gpus,
        use_fsdp_inference=False,
        offload=offload,
    )
    generator_config = GeneratorConfig(
        model_path=args.model_path,
        engine=engine,
        pipeline=PipelineSelection(workload_type="t2i"),
    )

    generator = VideoGenerator.from_config(generator_config)
    try:
        # A single frame at fps=1 is how this example requests one image;
        # guidance_scale is fixed at 1.0 rather than exposed on the CLI.
        sampling = SamplingConfig(
            height=args.height,
            width=args.width,
            num_frames=1,
            fps=1,
            num_inference_steps=args.steps,
            guidance_scale=1.0,
            seed=args.seed,
        )
        output = OutputConfig(
            output_path=args.output_path,
            save_video=True,
        )
        request = GenerationRequest(
            prompt=args.prompt,
            sampling=sampling,
            output=output,
        )
        generator.generate(request)
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_flux_dev.py

# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

import argparse
import contextlib
import os
import re

DEFAULT_PROMPTS = [
    "a photo of a cat",
    (
        "a cinematic photo of a red panda wearing a tiny backpack, standing on a "
        "rainy neon-lit street at night, shallow depth of field, sharp focus, "
        "35mm, bokeh"
    ),
]


def _safe_filename(text: str, max_len: int = 100) -> str:
    """Turn ``text`` into a filesystem-friendly filename stem.

    The text is truncated to ``max_len`` characters first, then cleaned: path
    separators become underscores, whitespace runs collapse to one space, and
    any character outside ``A-Za-z0-9 .,_-`` becomes an underscore. Leading and
    trailing spaces and dots are dropped. Returns ``"prompt"`` if nothing is left.
    """
    cleaned = text[:max_len].strip()
    # os.altsep is None on platforms with a single separator.
    for separator in (os.sep, os.altsep):
        if separator:
            cleaned = cleaned.replace(separator, "_")
    collapsed = re.sub(r"\s+", " ", cleaned)
    sanitized = re.sub(r"[^A-Za-z0-9 .,_-]", "_", collapsed).strip(" .")
    if not sanitized:
        return "prompt"
    return sanitized


def _remove_existing_outputs(out_dir: str, filename_base: str) -> None:
    """Remove earlier outputs for ``filename_base`` from ``out_dir``.

    Deletes ``<filename_base>.mp4`` / ``.png`` and their ``_<digits>`` numbered
    variants, so that reruns do not get ``_1``, ``_2`` suffixes. Does nothing
    when ``out_dir`` is not a directory; a file that has already disappeared
    by the time it is removed is ignored.
    """
    if os.path.isdir(out_dir):
        # filename_base is escaped so it is matched literally.
        stale_name = re.compile(rf"^{re.escape(filename_base)}(_\d+)?\.(mp4|png)$")
        for entry in os.listdir(out_dir):
            if stale_name.match(entry) is None:
                continue
            try:
                os.remove(os.path.join(out_dir, entry))
            except FileNotFoundError:
                pass


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the FLUX.1-dev text-to-image example."""
    p = argparse.ArgumentParser(
        description="Run FLUX.1-dev text-to-image with FastVideo VideoGenerator.",
    )

    # (flags, add_argument keyword options), registered in --help order.
    option_table = [
        (
            ("--model-path",),
            {
                "default": "official_weights/FLUX.1-dev",
                "help": "Local Diffusers checkpoint dir or HF repo id.",
            },
        ),
        (
            # Both spellings store into args.out_dir.
            ("--out-dir", "--outdir"),
            {
                "default": "outputs/flux_dev/samples",
                "help": "Directory for saved PNG outputs.",
            },
        ),
        (
            ("--prompt",),
            {
                "action": "append",
                "default": None,
                "help": "Prompt. Repeat for multiple images.",
            },
        ),
        (
            ("--backend",),
            {
                "default": None,
                "help": "Set FASTVIDEO_ATTENTION_BACKEND (e.g. TORCH_SDPA).",
            },
        ),
        (
            ("--seed",),
            {"type": int, "default": 42, "help": "Base seed; each prompt uses seed + index."},
        ),
        (("--height",), {"type": int, "default": 1024, "help": "Output height."}),
        (("--width",), {"type": int, "default": 1024, "help": "Output width."}),
        (("--steps",), {"type": int, "default": 28, "help": "Number of inference steps."}),
        (("--guidance",), {"type": float, "default": 3.5, "help": "Guidance scale."}),
        (("--num-gpus",), {"type": int, "default": 1, "help": "GPU count."}),
    ]
    for flags, options in option_table:
        p.add_argument(*flags, **options)

    return p.parse_args()


def main() -> None:
    """Generate one PNG per prompt with FLUX.1-dev and write them to ``--out-dir``.

    Uses the prompts given on the command line, or ``DEFAULT_PROMPTS`` when none
    are given. Prompt ``i`` is generated with seed ``--seed + i``. The generator
    is shut down in a ``finally`` clause even if a generation fails.
    """
    args = parse_args()
    prompts: list[str] = args.prompt or DEFAULT_PROMPTS

    if args.backend:
        os.environ["FASTVIDEO_ATTENTION_BACKEND"] = args.backend

    # Imported only after the backend variable is set -- presumably so the
    # choice is visible when fastvideo is imported (TODO confirm).
    from fastvideo import VideoGenerator

    os.makedirs(args.out_dir, exist_ok=True)

    generator = VideoGenerator.from_pretrained(
        model_path=args.model_path,
        num_gpus=args.num_gpus,
        workload_type="t2i",
        sp_size=1,
        tp_size=1,
        dit_cpu_offload=False,
        dit_layerwise_offload=False,
        text_encoder_cpu_offload=False,
        vae_cpu_offload=False,
        image_encoder_cpu_offload=False,
        pin_cpu_memory=False,
        use_fsdp_inference=False,
    )
    try:
        for index, prompt in enumerate(prompts):
            seed = args.seed + index
            slug = _safe_filename(prompt, max_len=80)
            filename_base = f"flux_dev_{index:02d}_seed{seed}_{slug}"

            # Clear earlier files for this exact name before generating.
            _remove_existing_outputs(args.out_dir, filename_base)
            output_path = os.path.join(args.out_dir, f"{filename_base}.png")
            print(f"[flux] prompt_idx={index} seed={seed} output_path={output_path}")

            # A single frame at fps=1 is how this example requests one image.
            generator.generate_video(
                prompt,
                output_path=output_path,
                height=args.height,
                width=args.width,
                num_frames=1,
                fps=1,
                num_inference_steps=args.steps,
                guidance_scale=args.guidance,
                use_embedded_guidance=True,
                true_cfg_scale=1.0,
                seed=seed,
                save_video=True,
            )

        print(f"[flux] done. outputs written to: {args.out_dir}")
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_gamecraft.py

# SPDX-License-Identifier: Apache-2.0
"""
Basic inference script for HunyuanGameCraft video generation.

HunyuanGameCraft generates game-like videos with camera/action control.
It takes an optional image input and generates video with camera motion
based on simple action commands (forward, left, right, backward, rotations).

Available actions:
    - forward (w): Move camera forward
    - backward (s): Move camera backward  
    - left (a): Move camera left (strafe)
    - right (d): Move camera right (strafe)
    - left_rot: Rotate camera left (pan)
    - right_rot: Rotate camera right (pan)
    - up_rot: Rotate camera up (tilt)
    - down_rot: Rotate camera down (tilt)

T2V vs I2V:
    - Default: I2V (uses a default reference image). Set GAMECRAFT_I2V_IMAGE to a
      URL or path to use a different image.
    - T2V only (no reference image): run with GAMECRAFT_I2V_IMAGE= (empty).
"""
import os

import torch

from fastvideo import VideoGenerator
from fastvideo.models.camera import create_camera_trajectory

# Model configuration (use GAMECRAFT_MODEL_PATH for local weights)
MODEL_PATH = os.environ.get("GAMECRAFT_MODEL_PATH", "FastVideo/HunyuanGameCraft-Diffusers")

# Default prompts for demo
DEFAULT_PROMPTS = {
    "village": "A charming medieval village with cobblestone streets, thatched-roof houses, and vibrant flower gardens under a bright blue sky.",
    "temple": "A majestic ancient temple stands under a clear blue sky, its grandeur highlighted by towering Doric columns and intricate architectural details.",
    "forest": "A lush green forest with tall trees, dappled sunlight filtering through the leaves, and a winding dirt path.",
    "beach": "A tropical beach with crystal clear turquoise water, white sand, and palm trees swaying in the breeze.",
}

# I2V: default reference image (URL). Can override with a local path.
DEFAULT_I2V_IMAGE_URL = (
    "https://huggingface.co/datasets/huggingface/documentation-images/"
    "resolve/main/diffusers/astronaut.jpg"
)
DEFAULT_I2V_PROMPT = (
    "An astronaut hatching from an egg, on the surface of the moon, "
    "the darkness and depth of space realised in the background."
)

OUTPUT_PATH = "video_samples_gamecraft"


def main():
    """Render one GameCraft clip that follows a scripted camera action.

    Loads the model named by ``MODEL_PATH``, builds a camera trajectory for a
    single action, then generates a video. The ``GAMECRAFT_I2V_IMAGE``
    environment variable selects the mode: unset uses the default reference
    image (I2V), an empty or whitespace-only value disables the image (T2V),
    and any other value is used as the reference image URL or path (I2V).
    """
    # FastVideo will automatically download weights from HuggingFace.
    generator = VideoGenerator.from_pretrained(
        MODEL_PATH,
        num_gpus=1,
        use_fsdp_inference=True,
        dit_cpu_offload=True,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,
    )

    # Clip geometry plus the camera action to apply.
    frame_height, frame_width, frame_count = 704, 1280, 33
    camera_action, camera_speed = "forward", 0.2

    # Camera trajectory (Plücker coordinates) for the chosen action.
    trajectory = create_camera_trajectory(
        action=camera_action,
        height=frame_height,
        width=frame_width,
        num_frames=frame_count,
        action_speed=camera_speed,
        dtype=torch.bfloat16,
    )
    print(f"Camera states shape: {trajectory.shape}")

    # Unset variable -> default image; blank value -> no image (T2V);
    # anything else -> that URL/path with surrounding whitespace removed.
    raw_image = os.environ.get("GAMECRAFT_I2V_IMAGE")
    if raw_image is None:
        reference_image = DEFAULT_I2V_IMAGE_URL
    else:
        reference_image = raw_image.strip() or None

    use_image = reference_image is not None
    if use_image:
        mode, prompt = "I2V", DEFAULT_I2V_PROMPT
    else:
        mode, prompt = "T2V", DEFAULT_PROMPTS["temple"]
    print(f"Mode: {mode}, prompt: {prompt[:60]}...")

    request = {
        "prompt": prompt,
        "negative_prompt": "",
        "camera_states": trajectory,
        "height": frame_height,
        "width": frame_width,
        "num_frames": frame_count,
        "num_inference_steps": 50,
        "guidance_scale": 6.0,
        "seed": 42,
        "fps": 24,
        "output_path": OUTPUT_PATH,
        "save_video": True,
    }
    # The image argument is only passed in I2V mode.
    if use_image:
        request["image_path"] = reference_image
    generator.generate_video(**request)


if __name__ == "__main__":
    main()

basic_gen3c.py

"""
GEN3C: 3D-aware camera-controlled video generation.

This example generates a video from a single input image with camera control.
The pipeline uses MoGe depth estimation, 3D point cloud forward warping,
and the GEN3C diffusion model.

Requirements:
  1. Install MoGe:
     uv pip install git+https://github.com/microsoft/MoGe.git
     If you hit `ImportError: libGL.so.1`, install:
     sudo apt-get update && sudo apt-get install -y libgl1 libglib2.0-0 libsm6 libxext6 libxrender1
  2. Download and convert weights:
     huggingface-cli download nvidia/GEN3C-Cosmos-7B --local-dir official_weights/GEN3C-Cosmos-7B
     python scripts/checkpoint_conversion/convert_gen3c_to_fastvideo.py \
       --source ./official_weights/GEN3C-Cosmos-7B/model.pt \
       --output ./converted_weights/GEN3C-Cosmos-7B \
       --components-source nvidia/Cosmos-Predict2-2B-Video2World
  3. Provide an input image for 3D-conditioned generation.
"""

import argparse

from fastvideo import VideoGenerator


def main():
    """Parse CLI options, load the GEN3C model and generate one video.

    Every generation setting (prompt, input image, camera trajectory,
    resolution, step count, seed, output path) comes from the command line;
    the frame rate is fixed at 24. The generator is shut down in a ``finally``
    block so ``shutdown()`` also runs when generation raises.
    """
    parser = argparse.ArgumentParser(description="GEN3C video generation")
    parser.add_argument("--model_path",
                        type=str,
                        default="converted_weights/GEN3C-Cosmos-7B")
    parser.add_argument("--image_path",
                        type=str,
                        default=None,
                        help="Input image for 3D cache conditioning")
    parser.add_argument("--prompt",
                        type=str,
                        default="A slow camera pan over a sunlit landscape.")
    parser.add_argument(
        "--negative_prompt",
        type=str,
        default=(
            "The video captures a series of frames showing ugly scenes, static with no motion, motion blur, "
            "over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, "
            "underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, "
            "jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special "
            "effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and "
            "flickering. Overall, the video is of poor quality."
        ),
    )
    parser.add_argument("--trajectory",
                        type=str,
                        default="left",
                        choices=[
                            "left", "right", "up", "down", "zoom_in",
                            "zoom_out", "clockwise", "counterclockwise", "none"
                        ])
    parser.add_argument("--movement_distance", type=float, default=0.3)
    parser.add_argument("--camera_rotation",
                        type=str,
                        default="center_facing",
                        choices=[
                            "center_facing", "no_rotation",
                            "trajectory_aligned"
                        ])
    parser.add_argument("--height", type=int, default=704)
    parser.add_argument("--width", type=int, default=1280)
    parser.add_argument("--num_frames", type=int, default=121)
    parser.add_argument("--num_inference_steps", type=int, default=35)
    parser.add_argument("--guidance_scale", type=float, default=1.0)
    parser.add_argument("--output_path",
                        type=str,
                        default="outputs_video/gen3c.mp4")
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    generator = VideoGenerator.from_pretrained(
        args.model_path,
        num_gpus=1,
        use_fsdp_inference=False,
        dit_cpu_offload=False,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,
    )

    try:
        # The return value is not needed: the video is saved to disk.
        generator.generate_video(
            args.prompt,
            negative_prompt=args.negative_prompt,
            image_path=args.image_path,
            trajectory_type=args.trajectory,
            movement_distance=args.movement_distance,
            camera_rotation=args.camera_rotation,
            height=args.height,
            width=args.width,
            num_frames=args.num_frames,
            num_inference_steps=args.num_inference_steps,
            guidance_scale=args.guidance_scale,
            fps=24,
            seed=args.seed,
            output_path=args.output_path,
            save_video=True,
        )
    finally:
        # Runs on success and on failure alike.
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_glm_image.py

# SPDX-License-Identifier: Apache-2.0
"""Run GLM-Image text-to-image generation through FastVideo.

User story:
    "I have the HF `zai-org/GLM-Image` checkpoint and want a minimal
    text-to-image generation command, saved as a PNG."
"""
import argparse
from pathlib import Path

from PIL import Image

from fastvideo import VideoGenerator
from fastvideo.api import (
    EngineConfig,
    GenerationRequest,
    GeneratorConfig,
    OutputConfig,
    ParallelismConfig,
    PipelineSelection,
    SamplingConfig,
)


def parse_args() -> argparse.Namespace:
    """Parse the command line for the GLM-Image example.

    Returns:
        The parsed namespace with ``model_path``, ``output``, ``prompt``,
        ``height``, ``width``, ``steps``, ``guidance_scale``, ``seed``,
        ``num_gpus``, ``tp_size`` and ``sp_size``.
    """
    cli = argparse.ArgumentParser(description="Run GLM-Image text-to-image generation.")

    default_prompt = ("A beautiful landscape photography with rolling hills, "
                      "a winding river, and a vibrant sunset in the background. "
                      "Warm golden light, photorealistic style.")

    # String options that carry help text.
    cli.add_argument("--model-path",
                     default="zai-org/GLM-Image",
                     help="HF id or local diffusers-format GLM-Image weights directory.")
    cli.add_argument("--output",
                     default="image_output/landscape.png",
                     help="Output PNG path.")
    cli.add_argument("--prompt", default=default_prompt, help="Text prompt.")

    # Numeric options, registered in order: (flag, converter, default).
    numeric_options = (
        ("--height", int, 1024),
        ("--width", int, 1024),
        ("--steps", int, 50),
        ("--guidance-scale", float, 1.5),
        ("--seed", int, 1024),
        ("--num-gpus", int, 1),
        ("--tp-size", int, None),
        ("--sp-size", int, None),
    )
    for flag, converter, fallback in numeric_options:
        cli.add_argument(flag, type=converter, default=fallback)

    return cli.parse_args()


def main() -> None:
    """Generate a single image with GLM-Image and save it to ``--output``.

    Creates the output's parent directory, builds the typed generator config
    and request, runs generation, and writes the first returned frame with
    PIL. If generation returns no frames a warning is printed instead of
    exiting silently. The generator is always shut down.
    """
    args = parse_args()

    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)

    # Explicit --tp-size / --sp-size win. Otherwise a multi-GPU run gets
    # tp_size=num_gpus and sp_size=1, and a single-GPU run gets tp_size=1 and
    # sp_size=num_gpus.
    multi_gpu = args.num_gpus > 1
    tp_size = args.tp_size if args.tp_size is not None else (args.num_gpus if multi_gpu else 1)
    sp_size = args.sp_size if args.sp_size is not None else (1 if multi_gpu else args.num_gpus)

    # GLM-Image needs trust_remote_code for its AR encoder; offload and the
    # pipeline class come from the model's registered defaults — don't override.
    generator_config = GeneratorConfig(
        model_path=args.model_path,
        trust_remote_code=True,
        engine=EngineConfig(
            num_gpus=args.num_gpus,
            parallelism=ParallelismConfig(tp_size=tp_size, sp_size=sp_size),
        ),
        pipeline=PipelineSelection(workload_type="t2i"),
    )

    generator = VideoGenerator.from_config(generator_config)
    try:
        request = GenerationRequest(
            prompt=args.prompt,
            sampling=SamplingConfig(
                height=args.height,
                width=args.width,
                # A single frame at fps=1: the request describes one image.
                num_frames=1,
                fps=1,
                num_inference_steps=args.steps,
                guidance_scale=args.guidance_scale,
                seed=args.seed,
            ),
            output=OutputConfig(
                output_path=str(output.parent),
                # Frames are returned and saved below rather than by the pipeline.
                save_video=False,
                return_frames=True,
            ),
        )
        result = generator.generate(request)
        # generate() may hand back a list; use its first entry.
        if isinstance(result, list):
            result = result[0]

        frames = result.frames
        if frames is not None and len(frames):
            Image.fromarray(frames[0]).save(output)
            print(f"Saved image to {output}")
        else:
            # Previously this case ended silently with no file and no message.
            print(f"Warning: generation returned no frames; nothing was written to {output}")
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_hy15.py

from fastvideo import VideoGenerator
import json
# from fastvideo.api.sampling_param import SamplingParam

OUTPUT_PATH = "video_samples_hy15"
def main():
    """Generate two sample clips with HunyuanVideo 1.5 (480p text-to-video).

    One generator is created and reused for both prompts; each clip is saved
    under ``OUTPUT_PATH`` with 81 frames at 16 fps.
    """
    # FastVideo picks the optimal default arguments for the model; for a
    # local path it makes a best-effort attempt to identify them.
    generator = VideoGenerator.from_pretrained(
        "hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v",
        num_gpus=1,  # distributed setup is handled automatically
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        # set to false if low CPU RAM or hit obscure "CUDA error: Invalid argument"
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )

    prompts = (
        ("A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
         "wide with interest. The playful yet serene atmosphere is complemented by soft "
         "natural light filtering through the petals. Mid-shot, warm and cheerful tones."),
        ("A majestic lion strides across the golden savanna, its powerful frame "
         "glistening under the warm afternoon sun. The tall grass ripples gently in "
         "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
         "embodying the raw energy of the wild. Low angle, steady tracking shot, "
         "cinematic."),
    )

    # Settings shared by both generations.
    shared_settings = {
        "output_path": OUTPUT_PATH,
        "save_video": True,
        "negative_prompt": "",
        "num_frames": 81,
        "fps": 16,
    }
    for text in prompts:
        generator.generate_video(text, **shared_settings)

if __name__ == "__main__":
    main()

basic_hy15_1080p.py

from fastvideo import VideoGenerator
import json
# from fastvideo.api.sampling_param import SamplingParam

OUTPUT_PATH = "video_samples_hy15_1080p"
def main():
    """Generate two sample clips with the HunyuanVideo 1.5 1080p checkpoint.

    One generator is created and reused for both prompts; each clip is saved
    under ``OUTPUT_PATH`` using the model's default sampling settings.
    """
    # FastVideo picks the optimal default arguments for the model; for a
    # local path it makes a best-effort attempt to identify them.
    generator = VideoGenerator.from_pretrained(
        "weizhou03/HunyuanVideo-1.5-Diffusers-1080p-2SR",  # 480p -> 720p -> 1080p
        # or "weizhou03/HunyuanVideo-1.5-Diffusers-1080p" # 720p -> 1080p
        num_gpus=1,  # distributed setup is handled automatically
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        # set to false if low CPU RAM or hit obscure "CUDA error: Invalid argument"
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )

    prompts = (
        ("A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
         "wide with interest. The playful yet serene atmosphere is complemented by soft "
         "natural light filtering through the petals. Mid-shot, warm and cheerful tones."),
        ("A majestic lion strides across the golden savanna, its powerful frame "
         "glistening under the warm afternoon sun. The tall grass ripples gently in "
         "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
         "embodying the raw energy of the wild. Low angle, steady tracking shot, "
         "cinematic."),
    )

    # Both prompts go through the same generator with identical options.
    for text in prompts:
        generator.generate_video(
            text,
            output_path=OUTPUT_PATH,
            save_video=True,
            negative_prompt="",
        )


if __name__ == "__main__":
    main()

basic_hyworld.py

from fastvideo import VideoGenerator
from fastvideo.models.dits.hyworld.resolution_utils import get_resolution_from_image

# Default prompt from HY-WorldPlay run.sh
DEFAULT_PROMPT = 'A paved pathway leads towards a stone arch bridge spanning a calm body of water.  Lush green trees and foliage line the path and the far bank of the water. A traditional-style pavilion with a tiered, reddish-brown roof sits on the far shore. The water reflects the surrounding greenery and the sky.  The scene is bathed in soft, natural light, creating a tranquil and serene atmosphere. The pathway is composed of large, rectangular stones, and the bridge is constructed of light gray stone.  The overall composition emphasizes the peaceful and harmonious nature of the landscape.'
DEFAULT_IMAGE = 'https://raw.githubusercontent.com/Tencent-Hunyuan/HY-WorldPlay/main/assets/img/test.png'

OUTPUT_PATH = "video_samples_hyworld"
def main():
    """Generate a HYWorld video from an image, a prompt and a pose string.

    Command-line options choose the prompt, input image, pose, output path,
    frame count, seed and resolution bucket. Height and width are derived
    from the input image via ``get_resolution_from_image``.
    """
    import argparse

    # pose: (a, w, s, d) - (15, 31)
    # num_frames: (61, 125)
    cli = argparse.ArgumentParser(description="HYWorld video generation with FastVideo")
    cli.add_argument("--prompt",
                     type=str,
                     default=DEFAULT_PROMPT,
                     help="Text prompt for video generation")
    cli.add_argument("--image",
                     type=str,
                     default=DEFAULT_IMAGE,
                     help="Path or URL to input image")
    cli.add_argument("--pose",
                     type=str,
                     default='w-31',
                     help="Pose string (e.g., 'a-31', 'w-31', 's-31', 'd-31')")
    cli.add_argument("--output_path",
                     type=str,
                     default=OUTPUT_PATH,
                     help="Output video path")
    cli.add_argument("--num-frames",
                     type=int,
                     default=125,
                     help="Number of frames")
    cli.add_argument("--seed", type=int, default=1, help="Random seed")
    cli.add_argument("--resolution",
                     type=str,
                     default="480p",
                     help="Only support 480p for now")
    args = cli.parse_args()

    # Resolution is picked automatically from the input image.
    height, width = get_resolution_from_image(args.image, args.resolution)
    for line in (
            f"Image: {args.image}",
            f"Pose: {args.pose}",
            f"Resolution: {height}x{width} (from {args.resolution} buckets)",
            f"Num frames: {args.num_frames}",
            f"Output path: {args.output_path}",
    ):
        print(line)

    print("\nInitializing VideoGenerator for HYWorld...")
    generator = VideoGenerator.from_pretrained(
        "FastVideo/HY-WorldPlay-Bidirectional-Diffusers",
        num_gpus=1,
        use_fsdp_inference=True,
        dit_cpu_offload=True,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,
        image_encoder_cpu_offload=True,
    )

    # The pipeline converts the pose string to camera matrices automatically.
    print("\nGenerating video...")
    generation_settings = {
        "prompt": args.prompt,
        "image_path": args.image,
        "pose": args.pose,  # camera trajectory control
        "output_path": args.output_path,
        "save_video": True,
        "negative_prompt": "",
        "num_frames": args.num_frames,
        "fps": 24,
        "height": height,
        "width": width,
        "seed": args.seed,
    }
    generator.generate_video(**generation_settings)

    print(f"\nVideo saved to: {args.output_path}")


if __name__ == "__main__":
    main()

basic_kandinsky5_i2v.py

from fastvideo import VideoGenerator

OUTPUT_PATH = "video_samples_kandinsky5_i2v"

IMAGE_PATH = "assets/girl.png"


def main():
    """Generate one Kandinsky 5 image-to-video clip from ``IMAGE_PATH``.

    The clip is 1024x1024 with 121 frames and is saved under ``OUTPUT_PATH``.
    """
    # Alternative checkpoints:
    #   "kandinskylab/Kandinsky-5.0-I2V-Pro-sft-5s-Diffusers"
    #   "kandinskylab/Kandinsky-5.0-I2V-Lite-5s-Diffusers"
    checkpoint = "kandinskylab/Kandinsky-5.0-I2V-Pro-distilled-5s-Diffusers"
    generator = VideoGenerator.from_pretrained(
        checkpoint,
        num_gpus=1,
        use_fsdp_inference=False,
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )

    generation_settings = {
        "image_path": IMAGE_PATH,
        "output_path": OUTPUT_PATH,
        "save_video": True,
        "height": 1024,
        "width": 1024,
        "num_frames": 121,
    }
    # The return value is not used; the video is saved to disk.
    generator.generate_video("A woman stands up and walks away", **generation_settings)


if __name__ == "__main__":
    main()

basic_kandinsky5_t2v.py

from fastvideo import VideoGenerator

OUTPUT_PATH = "video_samples_kandinsky5_t2v"

def main():
    """Generate two Kandinsky 5 text-to-video clips with one generator.

    Each clip is 512x768 with 121 frames and is saved under ``OUTPUT_PATH``.
    """
    # Alternative checkpoints:
    #   "kandinskylab/Kandinsky-5.0-T2V-Pro-sft-5s-Diffusers"
    #   "kandinskylab/Kandinsky-5.0-T2V-Lite-distilled16steps-5s-Diffusers"
    #   "kandinskylab/Kandinsky-5.0-T2V-Pro-distilled-5s-Diffusers"
    checkpoint = "kandinskylab/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers"
    generator = VideoGenerator.from_pretrained(
        checkpoint,
        num_gpus=1,
        use_fsdp_inference=False,
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )

    prompts = (
        ("A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
         "wide with interest. The playful yet serene atmosphere is complemented by soft "
         "natural light filtering through the petals. Mid-shot, warm and cheerful tones."),
        ("A majestic lion strides across the golden savanna, its powerful frame "
         "glistening under the warm afternoon sun. The tall grass ripples gently in "
         "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
         "embodying the raw energy of the wild. Low angle, steady tracking shot, "
         "cinematic."),
    )

    # Return values are not used; the videos are saved to disk.
    for text in prompts:
        generator.generate_video(
            text,
            output_path=OUTPUT_PATH,
            save_video=True,
            height=512,
            width=768,
            num_frames=121,
        )


if __name__ == "__main__":
    main()

basic_lingbot_video.py

"""Generate a five-second Dense LingBot-Video clip with the official defaults."""

import argparse
from pathlib import Path

from fastvideo import VideoGenerator


def parse_args() -> argparse.Namespace:
    """Read the checkpoint location and output directory from the command line.

    Returns:
        Namespace with ``model_path`` (required) and ``output_path``, both
        converted to ``Path``.
    """
    cli = argparse.ArgumentParser(description=__doc__)

    # Flag -> keyword arguments for add_argument, registered in this order.
    options = {
        "--model-path": {
            "type": Path,
            "required": True,
            "help": "Path to a converted Dense LingBot-Video checkpoint.",
        },
        "--output-path": {
            "type": Path,
            "default": Path("outputs/lingbot-video/dense-t2v"),
            "help": "Directory for the generated video.",
        },
    }
    for flag, spec in options.items():
        cli.add_argument(flag, **spec)

    return cli.parse_args()


def main() -> None:
    """Generate the default T2V sample from a converted Dense checkpoint.

    The generator is shut down in a ``finally`` block, so ``shutdown()`` runs
    whether or not generation succeeds.
    """
    cli_args = parse_args()

    loader_options = {
        "num_gpus": 1,
        "use_fsdp_inference": False,
        "text_encoder_cpu_offload": True,
        "vae_cpu_offload": False,
        "pin_cpu_memory": True,
    }
    generator = VideoGenerator.from_pretrained(str(cli_args.model_path), **loader_options)

    # Dict-form request: only the prompt and output handling are specified.
    request = {
        "prompt": "A red fox runs through fresh snow at sunrise.",
        "output": {
            "output_path": str(cli_args.output_path),
            "save_video": True,
            "return_frames": False,
        },
    }
    try:
        generator.generate(request)
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_lingbotworld2_causal_fast.py

# SPDX-License-Identifier: Apache-2.0
"""Run LingBot World 2 14B causal-fast I2V generation with FastVideo."""

import os
from pathlib import Path

from fastvideo import VideoGenerator


REPO_ROOT = Path(__file__).resolve().parents[3]
DATASET_DIR = REPO_ROOT / "examples" / "dataset" / "lingbotworld2"
OUTPUT_PATH = REPO_ROOT / "outputs" / "lingbotworld2_causal_fast.mp4"


def main() -> None:
    """Load the native FastVideo LingBot World 2 causal-fast pipeline and generate one video.

    The model location is read from the ``LINGBOTWORLD2_MODEL_PATH``
    environment variable. The input image and action data are read from
    ``DATASET_DIR`` and the result is written to ``OUTPUT_PATH``.

    Raises:
        RuntimeError: If ``LINGBOTWORLD2_MODEL_PATH`` is unset or empty.
    """
    # Validate before any model loading so a missing variable produces an
    # actionable message instead of a bare KeyError.
    model_path = os.environ.get("LINGBOTWORLD2_MODEL_PATH")
    if not model_path:
        raise RuntimeError(
            "LINGBOTWORLD2_MODEL_PATH is not set; export it with the LingBot World 2 "
            "model path before running this example.")

    generator = VideoGenerator.from_pretrained(
        model_path,
        num_gpus=8,
        sp_size=8,
        hsdp_shard_dim=8,
        use_fsdp_inference=True,
        dit_layerwise_offload=False,
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=False,
        pin_cpu_memory=True,
        override_pipeline_cls_name="LingBotWorld2CausalFastPipeline",
    )

    try:
        generator.generate_video(
            "A serene lakeside scene with a lone tree standing in calm water, surrounded by distant snow-capped mountains under a bright blue sky with drifting white clouds; gentle ripples reflect the tree and sky, creating a tranquil, meditative atmosphere.",
            image_path=str(DATASET_DIR / "image.jpg"),
            action_path=str(DATASET_DIR),
            output_path=str(OUTPUT_PATH),
            save_video=True,
            height=480,
            width=832,
            num_frames=65,
            num_inference_steps=4,
            guidance_scale=1.0,
            negative_prompt="",
            fps=16,
            seed=42,
        )
    finally:
        # Runs on success and on failure alike.
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_lingbotworld_base_cam.py

from fastvideo import VideoGenerator
from fastvideo.models.dits.lingbotworld.cam_utils import prepare_camera_embedding

# from fastvideo.api.sampling_param import SamplingParam
OUTPUT_PATH = "video_samples_lingbotworld"
def main():
    """Generate a camera-controlled LingBot-World clip from an image.

    Loads the model, builds the camera embedding from the example action
    directory, and generates an 832x480 video saved under ``OUTPUT_PATH``.
    The frame count passed to generation is the one returned by
    ``prepare_camera_embedding``.
    """
    # FastVideo picks the optimal default arguments for the model; for a
    # local path it makes a best-effort attempt to identify them.
    generator = VideoGenerator.from_pretrained(
        "FastVideo/LingBot-World-Base-Cam-Diffusers",
        num_gpus=1,  # distributed setup is handled automatically
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,  # DiT need to be offloaded for MoE
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        # Set pin_cpu_memory to false if CPU RAM is limited and there're no
        # frequent CPU-GPU transfer
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )

    frame_height, frame_width = 480, 832
    requested_frames = 81
    prompt = "The video presents a soaring journey through a fantasy jungle. The wind whips past the rider's blue hands gripping the reins, causing the leather straps to vibrate. The ancient gothic castle approaches steadily, its stone details becoming clearer against the backdrop of floating islands and distant waterfalls."
    image_path = "https://raw.githubusercontent.com/Robbyant/lingbot-world/main/examples/00/image.jpg"
    action_path = "examples/inference/basic/lingbotworld_examples/00"

    # The helper returns the embedding together with a frame count, which
    # replaces the requested one for generation.
    camera_embedding, frame_count = prepare_camera_embedding(
        action_path=action_path,
        num_frames=requested_frames,
        height=frame_height,
        width=frame_width,
        spatial_scale=8,
    )

    generation_settings = {
        "image_path": image_path,
        "output_path": OUTPUT_PATH,
        "save_video": True,
        "num_frames": frame_count,
        "height": frame_height,
        "width": frame_width,
        "c2ws_plucker_emb": camera_embedding,
    }
    generator.generate_video(prompt, **generation_settings)


if __name__ == "__main__":
    main()

basic_longcat_i2v.py

"""
LongCat Image-to-Video (I2V) Example Script

This script demonstrates LongCat I2V inference using the FastVideo Python API.
LongCat I2V takes an input image and generates a video from it.

It runs both basic generation (50 steps) and distill+refine generation
(16 steps distill + 50 steps refinement to 720p with BSA).

Usage:
    python examples/inference/basic/basic_longcat_i2v.py

Note:
    The refinement stage in this script is configured for 720x720 output with
    BSA chunks [4, 4, 4]. For BSA chunks [4, 4, 8], use 768x768 instead: its
    latent (48x48) is divisible by 8.
"""

import glob
import os

from fastvideo import VideoGenerator

# Shared prompt text and settings, matching the shell script examples.
PROMPT = (
    "A woman sits at a wooden table by the window in a cozy café. "
    "She reaches out with her right hand, picks up the white coffee cup from the saucer, "
    "and gently brings it to her lips to take a sip. "
    "After drinking, she places the cup back on the table and looks out the window, "
    "enjoying the peaceful atmosphere."
)

# Comma-separated list of traits to steer away from.
NEGATIVE_PROMPT = ", ".join((
    "Bright tones",
    "overexposed",
    "static",
    "blurred details",
    "subtitles",
    "style",
    "works",
    "paintings",
    "images",
    "static",
    "overall gray",
    "worst quality",
    "low quality",
    "JPEG compression residue",
    "ugly",
    "incomplete",
    "extra fingers",
    "poorly drawn hands",
    "poorly drawn faces",
    "deformed",
    "disfigured",
    "misshapen limbs",
    "fused fingers",
    "still picture",
    "messy background",
    "three legs",
    "many people in the background",
    "walking backwards",
))

# Conditioning image for the I2V runs.
IMAGE_PATH = "assets/girl.png"

# Seed shared by every generation in this script.
SEED = 42


def basic_generation():
    """
    Run basic LongCat I2V generation (50 steps at 480p).

    This uses the full 50-step denoising process for highest quality.
    The clip is 480x480 with 93 frames at 15 fps, conditioned on
    ``IMAGE_PATH`` and saved under ``outputs_video/longcat_i2v_basic``.
    The generator is shut down in a ``finally`` block so ``shutdown()`` also
    runs when generation raises.
    """
    print("=" * 60)
    print("LongCat I2V: Basic Generation (50 steps, 480p)")
    print("=" * 60)

    generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-I2V-Diffusers",
        num_gpus=1,
        use_fsdp_inference=False, # set to True if GPU is out of memory
        dit_cpu_offload=False,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=False,
        enable_bsa=False,
    )

    output_path = "outputs_video/longcat_i2v_basic"

    try:
        generator.generate_video(
            prompt=PROMPT,
            negative_prompt=NEGATIVE_PROMPT,
            image_path=IMAGE_PATH,
            output_path=output_path,
            save_video=True,
            height=480,
            width=480,  # Square
            num_frames=93,
            num_inference_steps=50,
            fps=15,
            guidance_scale=4.0,
            seed=SEED,
        )

        print(f"\nBasic generation complete! Video saved to: {output_path}")
    finally:
        # Previously skipped when generate_video raised.
        generator.shutdown()


def distill_refine_generation():
    """
    Run the two-stage LongCat I2V pipeline: distilled draft, then refinement.

    Stage 1 loads the I2V model with the distilled LoRA and renders a
    480x480 clip in 16 steps. Stage 2 loads the T2V model with the
    refinement LoRA and BSA enabled, and refines the newest ``.mp4`` found in
    the stage 1 output directory at 720x720.

    Each generator is shut down in a ``finally`` block so ``shutdown()`` also
    runs when generation raises.

    Raises:
        FileNotFoundError: If stage 1 left no ``.mp4`` in its output directory.
    """
    print("\n" + "=" * 60)
    print("LongCat I2V: Distill + Refine Pipeline")
    print("=" * 60)

    # Stage 1: Distilled generation (16 steps at 480p)
    print("\n[Stage 1] Distilled generation (16 steps, 480p)")
    print("-" * 40)

    generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-I2V-Diffusers",
        num_gpus=1,
        use_fsdp_inference=True,
        dit_cpu_offload=False,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=False,
        enable_bsa=False,
        lora_path="FastVideo/LongCat-Video-T2V-Distilled-LoRA",
        lora_nickname="distilled",
    )

    distill_output_path = "outputs_video/longcat_i2v_distill"

    try:
        generator.generate_video(
            prompt=PROMPT,
            negative_prompt=NEGATIVE_PROMPT,
            image_path=IMAGE_PATH,
            output_path=distill_output_path,
            save_video=True,
            height=480,
            width=480,  # Square
            num_frames=93,
            num_inference_steps=16,
            fps=15,
            guidance_scale=1.0,
            seed=SEED,
        )

        print(f"Distilled generation complete! Video saved to: {distill_output_path}")
    finally:
        # Shut stage 1 down before the stage 2 model is loaded, even on failure.
        generator.shutdown()

    # Stage 2: Refinement (480p -> 720p)
    print("\n[Stage 2] Refinement (480p -> 768p with BSA)")
    print("-" * 40)

    # Find the actual saved video file from stage 1
    video_files = glob.glob(os.path.join(distill_output_path, "*.mp4"))
    if not video_files:
        raise FileNotFoundError(f"No video file found in {distill_output_path}")
    # Use the most recently modified video file
    distill_video_path = max(video_files, key=os.path.getmtime)
    print(f"Using stage 1 video: {distill_video_path}")

    # Create a new generator with refinement LoRA and BSA enabled
    # Note: Refinement uses the T2V model (not I2V) since it's upscaling the generated video
    # NOTE(review): earlier notes for this script describe a 768x768 target
    # (latent 48x48, divisible by 8) with BSA chunks [4, 4, 8]; the values
    # actually passed below are 720x720 with chunks [4, 4, 4], and the banner
    # printed above still says 768p. Confirm which combination is intended.
    refine_generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-T2V-Diffusers",
        num_gpus=1,
        use_fsdp_inference=True,
        dit_cpu_offload=True,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=False,
        enable_bsa=True,
        bsa_sparsity=0.875,
        bsa_chunk_q=[4, 4, 4],
        bsa_chunk_k=[4, 4, 4],
        lora_path="FastVideo/LongCat-Video-T2V-Refinement-LoRA",
        lora_nickname="refinement",
    )

    refine_output_path = "outputs_video/longcat_i2v_refine_720p"

    try:
        refine_generator.generate_video(
            prompt=PROMPT,
            negative_prompt=NEGATIVE_PROMPT,
            output_path=refine_output_path,
            save_video=True,
            refine_from=distill_video_path,
            t_thresh=0.5,
            spatial_refine_only=False,
            num_cond_frames=0,
            height=720,
            width=720,
            num_inference_steps=50,
            fps=30,
            guidance_scale=1.0,
            seed=SEED,
        )

        print(f"Refinement complete! Video saved to: {refine_output_path}")
    finally:
        refine_generator.shutdown()


def main():
    """Run the basic pipeline, then the distill+refine pipeline, in that order."""
    rule = "=" * 60

    # Opening banner.
    print("\n" + rule)
    print("LongCat Image-to-Video Example")
    print(rule + "\n")

    # Execute each pipeline in sequence.
    for pipeline in (basic_generation, distill_refine_generation):
        pipeline()

    # Closing banner.
    print("\n" + rule)
    print("All generations complete!")
    print(rule)


if __name__ == "__main__":
    main()

basic_longcat_t2v.py

"""
LongCat Text-to-Video (T2V) Example Script

This script demonstrates LongCat T2V inference using the FastVideo Python API.
It runs both basic generation (50 steps) and distill+refine generation
(16 steps distill + 50 steps refinement to 720p).

Usage:
    python examples/inference/basic/basic_longcat_t2v.py
"""

import glob
import os

from fastvideo import VideoGenerator

# Common prompts and settings matching the shell script examples
PROMPT = (
    "In a realistic photography style, a white boy around seven or eight years old "
    "sits on a park bench, wearing a light blue T-shirt, denim shorts, and white sneakers. "
    "He holds an ice cream cone with vanilla and chocolate flavors, and beside him is a "
    "medium-sized golden Labrador. Smiling, the boy offers the ice cream to the dog, "
    "who eagerly licks it with its tongue. The sun is shining brightly, and the background "
    "features a green lawn and several tall trees, creating a warm and loving scene."
)

NEGATIVE_PROMPT = (
    "Bright tones, overexposed, static, blurred details, subtitles, style, works, "
    "paintings, images, static, overall gray, worst quality, low quality, JPEG compression "
    "residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, "
    "deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, "
    "three legs, many people in the background, walking backwards"
)

SEED = 42


def basic_generation():
    """
    Run basic LongCat T2V generation (50 steps at 480p).

    Loads ``FastVideo/LongCat-Video-T2V-Diffusers`` on one GPU with no LoRA and
    BSA disabled, then renders a single 93-frame 832x480 clip from the
    module-level ``PROMPT`` / ``NEGATIVE_PROMPT`` / ``SEED`` into
    ``outputs_video/longcat_t2v_basic``.

    The generator is shut down in a ``finally`` block so ``shutdown()`` still
    runs when ``generate_video`` raises.
    """
    print("=" * 60)
    print("LongCat T2V: Basic Generation (50 steps, 480p)")
    print("=" * 60)

    generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-T2V-Diffusers",
        num_gpus=1,
        use_fsdp_inference=False, # set to True if GPU is out of memory
        dit_cpu_offload=False,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=False,
        enable_bsa=False,
    )

    output_path = "outputs_video/longcat_t2v_basic"

    try:
        generator.generate_video(
            prompt=PROMPT,
            negative_prompt=NEGATIVE_PROMPT,
            output_path=output_path,
            save_video=True,
            height=480,
            width=832,
            num_frames=93,
            num_inference_steps=50,
            fps=15,
            guidance_scale=4.0,
            seed=SEED,
        )

        print(f"\nBasic generation complete! Video saved to: {output_path}")
    finally:
        # Always release the generator, including on a failed generation.
        generator.shutdown()


def distill_refine_generation():
    """
    Run LongCat T2V with distill+refine pipeline (16 steps + refinement to 720p).

    Stage 1 loads the T2V model with the distilled LoRA and renders a 16-step
    832x480 clip. Stage 2 loads a second generator with the refinement LoRA and
    BSA enabled, and refines the newest stage-1 ``.mp4`` to 1280x720.

    Each generator is shut down in a ``finally`` block so ``shutdown()`` still
    runs when ``generate_video`` raises.

    Raises:
        FileNotFoundError: If stage 1 left no ``.mp4`` file in its output
            directory.
    """
    print("\n" + "=" * 60)
    print("LongCat T2V: Distill + Refine Pipeline")
    print("=" * 60)

    # Stage 1: Distilled generation (16 steps at 480p)
    print("\n[Stage 1] Distilled generation (16 steps, 480p)")
    print("-" * 40)

    generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-T2V-Diffusers",
        num_gpus=1,
        use_fsdp_inference=True,
        dit_cpu_offload=False,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=False,
        enable_bsa=False,
        lora_path="FastVideo/LongCat-Video-T2V-Distilled-LoRA",
        lora_nickname="distilled",
    )

    distill_output_path = "outputs_video/longcat_t2v_distill"

    try:
        generator.generate_video(
            prompt=PROMPT,
            negative_prompt=NEGATIVE_PROMPT,
            output_path=distill_output_path,
            save_video=True,
            height=480,
            width=832,
            num_frames=93,
            num_inference_steps=16,
            fps=15,
            guidance_scale=1.0,
            seed=SEED,
        )

        print(f"Distilled generation complete! Video saved to: {distill_output_path}")
    finally:
        # Release the stage-1 generator before stage 2 loads its own model,
        # and also when stage 1 fails.
        generator.shutdown()

    # Stage 2: Refinement (480p -> 720p)
    print("\n[Stage 2] Refinement (480p -> 720p with BSA)")
    print("-" * 40)

    # Find the actual saved video file from stage 1
    video_files = glob.glob(os.path.join(distill_output_path, "*.mp4"))
    if not video_files:
        raise FileNotFoundError(f"No video file found in {distill_output_path}")
    # Use the most recently modified video file, so leftovers from earlier
    # runs in the same directory are not picked up.
    distill_video_path = max(video_files, key=os.path.getmtime)
    print(f"Using stage 1 video: {distill_video_path}")

    # Create a new generator with refinement LoRA and BSA enabled
    refine_generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-T2V-Diffusers",
        num_gpus=1,
        use_fsdp_inference=True,
        dit_cpu_offload=True,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=False,
        enable_bsa=True,
        bsa_sparsity=0.875,
        bsa_chunk_q=[4, 4, 8],
        bsa_chunk_k=[4, 4, 8],
        lora_path="FastVideo/LongCat-Video-T2V-Refinement-LoRA",
        lora_nickname="refinement",
    )

    refine_output_path = "outputs_video/longcat_t2v_refine_720p"

    try:
        refine_generator.generate_video(
            prompt=PROMPT,
            negative_prompt=NEGATIVE_PROMPT,
            output_path=refine_output_path,
            save_video=True,
            refine_from=distill_video_path,
            t_thresh=0.5,
            spatial_refine_only=False,
            num_cond_frames=0,
            height=720,
            width=1280,
            num_inference_steps=50,
            fps=30,
            guidance_scale=1.0,
            seed=SEED,
        )

        print(f"Refinement complete! Video saved to: {refine_output_path}")
    finally:
        refine_generator.shutdown()


def main():
    """Drive the T2V example: basic generation, then distill+refine, with banners."""
    separator = "=" * 60

    # Header: blank line, rule, title, rule, blank line.
    print("\n", separator, sep="")
    print("LongCat Text-to-Video Example")
    print(separator, "\n", sep="")

    # Basic generation runs first, the distill+refine pipeline second.
    basic_generation()
    distill_refine_generation()

    # Footer.
    print("\n", separator, sep="")
    print("All generations complete!")
    print(separator)


if __name__ == "__main__":
    main()

basic_longcat_vc.py

"""
LongCat Video Continuation (VC) Example Script

This script demonstrates LongCat VC inference using the FastVideo Python API.
LongCat VC takes an input video and generates a continuation of it.

It runs both basic generation (50 steps) and distill+refine generation
(16 steps distill + 50 steps refinement to 720p).

Usage:
    python examples/inference/basic/basic_longcat_vc.py

Prerequisites:
    - Ensure the input video exists at assets/motorcycle.mp4
      (or provide your own video)
"""

import glob
import os

from fastvideo import VideoGenerator

# Common prompts and settings matching the shell script examples
PROMPT = (
    "A person rides a motorcycle along a long, straight road that stretches between "
    "a body of water and a forested hillside. The rider steadily accelerates, keeping "
    "the motorcycle centered between the guardrails, while the scenery passes by on "
    "both sides. The video captures the journey from the rider's perspective, emphasizing "
    "the sense of motion and adventure."
)

NEGATIVE_PROMPT = (
    "Bright tones, overexposed, static, blurred details, subtitles, style, works, "
    "paintings, images, static, overall gray, worst quality, low quality, JPEG compression "
    "residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, "
    "deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, "
    "three legs, many people in the background, walking backwards"
)

# Input video path
VIDEO_PATH = "assets/motorcycle.mp4"

# Number of conditioning frames from the input video
NUM_COND_FRAMES = 13

SEED = 42


def basic_generation():
    """
    Run basic LongCat VC generation (50 steps at 480p).

    Loads ``FastVideo/LongCat-Video-VC-Diffusers`` on one GPU with no LoRA and
    BSA disabled, conditions on ``NUM_COND_FRAMES`` frames of ``VIDEO_PATH``,
    and renders a 93-frame 832x480 clip into
    ``outputs_video/longcat_vc_basic``.

    The generator is shut down in a ``finally`` block so ``shutdown()`` still
    runs when ``generate_video`` raises.

    Raises:
        FileNotFoundError: If ``VIDEO_PATH`` does not exist.
    """
    print("=" * 60)
    print("LongCat VC: Basic Generation (50 steps, 480p)")
    print("=" * 60)

    # Check if video exists (before paying for the model load)
    if not os.path.exists(VIDEO_PATH):
        raise FileNotFoundError(
            f"Video not found at {VIDEO_PATH}. "
            "Please provide a valid video path."
        )

    generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-VC-Diffusers",
        num_gpus=1,
        use_fsdp_inference=False, # set to True if GPU is out of memory
        dit_cpu_offload=False,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=False,
        enable_bsa=False,
    )

    output_path = "outputs_video/longcat_vc_basic"

    try:
        generator.generate_video(
            prompt=PROMPT,
            negative_prompt=NEGATIVE_PROMPT,
            video_path=VIDEO_PATH,
            num_cond_frames=NUM_COND_FRAMES,
            output_path=output_path,
            save_video=True,
            height=480,
            width=832,
            num_frames=93,
            num_inference_steps=50,
            fps=15,
            guidance_scale=4.0,
            seed=SEED,
        )

        print(f"\nBasic generation complete! Video saved to: {output_path}")
    finally:
        # Always release the generator, including on a failed generation.
        generator.shutdown()


def distill_refine_generation():
    """
    Run LongCat VC with distill+refine pipeline (16 steps + refinement to 720p).

    Stage 1 loads the VC model with the distilled LoRA and renders a 16-step
    832x480 continuation of ``VIDEO_PATH``. Stage 2 loads the T2V model with
    the refinement LoRA and BSA enabled, and refines the newest stage-1
    ``.mp4`` to 1280x720.

    Each generator is shut down in a ``finally`` block so ``shutdown()`` still
    runs when ``generate_video`` raises.

    Raises:
        FileNotFoundError: If ``VIDEO_PATH`` does not exist, or if stage 1
            left no ``.mp4`` file in its output directory.
    """
    print("\n" + "=" * 60)
    print("LongCat VC: Distill + Refine Pipeline")
    print("=" * 60)

    # Check if video exists (before paying for the model load)
    if not os.path.exists(VIDEO_PATH):
        raise FileNotFoundError(
            f"Video not found at {VIDEO_PATH}. "
            "Please provide a valid video path."
        )

    # Stage 1: Distilled generation (16 steps at 480p)
    print("\n[Stage 1] Distilled generation (16 steps, 480p)")
    print("-" * 40)

    generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-VC-Diffusers",
        num_gpus=1,
        use_fsdp_inference=True,
        dit_cpu_offload=False,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=False,
        enable_bsa=False,
        lora_path="FastVideo/LongCat-Video-T2V-Distilled-LoRA",
        lora_nickname="distilled",
    )

    distill_output_path = "outputs_video/longcat_vc_distill"

    try:
        generator.generate_video(
            prompt=PROMPT,
            negative_prompt=NEGATIVE_PROMPT,
            video_path=VIDEO_PATH,
            num_cond_frames=NUM_COND_FRAMES,
            output_path=distill_output_path,
            save_video=True,
            height=480,
            width=832,
            num_frames=93,
            num_inference_steps=16,
            fps=15,
            guidance_scale=1.0,
            seed=SEED,
        )

        print(f"Distilled generation complete! Video saved to: {distill_output_path}")
    finally:
        # Release the stage-1 generator before stage 2 loads its own model,
        # and also when stage 1 fails.
        generator.shutdown()

    # Stage 2: Refinement (480p -> 720p)
    print("\n[Stage 2] Refinement (480p -> 720p with BSA)")
    print("-" * 40)

    # Find the actual saved video file from stage 1
    video_files = glob.glob(os.path.join(distill_output_path, "*.mp4"))
    if not video_files:
        raise FileNotFoundError(f"No video file found in {distill_output_path}")
    # Use the most recently modified video file, so leftovers from earlier
    # runs in the same directory are not picked up.
    distill_video_path = max(video_files, key=os.path.getmtime)
    print(f"Using stage 1 video: {distill_video_path}")

    # Create a new generator with refinement LoRA and BSA enabled
    # Note: Refinement uses the T2V model (not VC) since it's upscaling the generated video
    refine_generator = VideoGenerator.from_pretrained(
        "FastVideo/LongCat-Video-T2V-Diffusers",
        num_gpus=1,
        use_fsdp_inference=True,
        dit_cpu_offload=True,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=False,
        enable_bsa=True,
        bsa_sparsity=0.875,
        bsa_chunk_q=[4, 4, 8],
        bsa_chunk_k=[4, 4, 8],
        lora_path="FastVideo/LongCat-Video-T2V-Refinement-LoRA",
        lora_nickname="refinement",
    )

    refine_output_path = "outputs_video/longcat_vc_refine_720p"

    try:
        refine_generator.generate_video(
            prompt=PROMPT,
            negative_prompt=NEGATIVE_PROMPT,
            output_path=refine_output_path,
            save_video=True,
            refine_from=distill_video_path,
            t_thresh=0.5,
            spatial_refine_only=False,
            num_cond_frames=0,  # For refinement, no conditioning frames
            height=720,
            width=1280,
            num_inference_steps=50,
            fps=30,
            guidance_scale=1.0,
            seed=SEED,
        )

        print(f"Refinement complete! Video saved to: {refine_output_path}")
    finally:
        refine_generator.shutdown()


def main():
    """Run the VC example end to end: basic generation, then distill+refine."""
    bar = "=" * 60

    # Header emitted as one write: blank line, rule, title, rule, blank line.
    print("\n".join(["", bar, "LongCat Video Continuation Example", bar, ""]))

    # Basic generation first, then the two-stage distill+refine pipeline.
    basic_generation()
    distill_refine_generation()

    # Footer: blank line, rule, message, rule.
    print("\n".join(["", bar, "All generations complete!", bar]))


if __name__ == "__main__":
    main()

basic_ltx2.py

from fastvideo import VideoGenerator


PROMPT = (
    "A warm sunny backyard. The camera starts in a tight cinematic close-up "
    "of a woman and a man in their 30s, facing each other with serious "
    "expressions. The woman, emotional and dramatic, says softly, \"That's "
    "it... Dad's lost it. And we've lost Dad.\" The man exhales, slightly "
    "annoyed: \"Stop being so dramatic, Jess.\" A beat. He glances aside, "
    "then mutters defensively, \"He's just having fun.\" The camera slowly "
    "pans right, revealing the grandfather in the garden wearing enormous "
    "butterfly wings, waving his arms in the air like he's trying to take "
    "off. He shouts, \"Wheeeew!\" as he flaps his wings with full commitment. "
    "The woman covers her face, on the verge of tears. The tone is deadpan, "
    "absurd, and quietly tragic."
)


def main() -> None:
    """Generate one LTX2 base text-to-video clip and save it as an mp4.

    Loads ``Davids048/LTX2-Base-Diffusers`` on one GPU and renders a 121-frame
    1920x1088 clip from the module-level ``PROMPT``. The generator is shut
    down in a ``finally`` block so ``shutdown()`` still runs when
    ``generate_video`` raises.
    """
    # Uses FastVideo default sampling settings for LTX2 base.
    generator = VideoGenerator.from_pretrained(
        "Davids048/LTX2-Base-Diffusers",
        num_gpus=1,
    )

    output_path = "outputs_video/ltx2_basic/output_ltx2_base_t2v_1088_1920_1.1.mp4"
    try:
        generator.generate_video(
            prompt=PROMPT,
            output_path=output_path,
            save_video=True,
            num_frames=121,
            height=1088,
            width=1920,
        )
    finally:
        # Always release the generator, including on a failed generation.
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_ltx2_3_distilled_i2v.py

# SPDX-License-Identifier: Apache-2.0
"""LTX-2.3 distilled image-to-video with torch.compile + timing breakdown.

This example runs the LTX-2.3 distilled student model on a single GPU with
torch.compile fully enabled, then prints a per-stage timing breakdown so the
user can see where wall-time goes. It is meant as the canonical entry point
for trying out the LTX-2.3 i2v path on `hao-ai-lab/FastVideo:main`.

Quick start
-----------
    export LTX23_I2V_IMAGE=/path/to/your/portrait_or_product.jpg
    # optional overrides:
    #   export LTX23_I2V_PROMPT="a fashion model walks toward camera..."
    #   export LTX23_OUTPUT_DIR=outputs_video/ltx2_3_distilled_i2v
    python examples/inference/basic/basic_ltx2_3_distilled_i2v.py

What the script does
--------------------
1. Loads FastVideo/LTX-2.3-Distilled-Diffusers (8 denoise + 3 refine steps,
   CFG=1, no refine LoRA — the distilled production recipe).
2. Compiles the DiT, text encoder, and VAE (fullgraph, Inductor default
   mode — autotune adds ~7 min cold-compile here with no measurable
   e2e gain).
3. Runs 2 warmup calls (untimed) + 2 measured calls. Two warmups are kept
   as a safety net — the first call pays cold compile + first-shape guard
   work, and a second warmup ensures any residual recompiles settle before
   we measure.
4. Prints a per-stage breakdown and an average over the measured runs.

Hardware notes
--------------
- Single-GPU example; for multi-GPU sequence-parallel see the gradio demo
  under `examples/inference/gradio/local/gradio_local_demo_ltx2_3/`.
- First-time compile takes ~30-40 min on GB200 (~20 min on H100; cached
  in `$TORCHINDUCTOR_CACHE_DIR` afterwards). Subsequent invocations only
  pay the one-time process load + a few seconds of dynamo trace.
- On GB200 / Blackwell, run with `env -u LD_LIBRARY_PATH ...` to avoid a
  system-cuBLAS / torch-cuBLAS mismatch that fails every GEMM. The
  `_inductor.shape_padding = False` line below also avoids a pad_mm
  landmine on the same generation of cards.
"""
from __future__ import annotations

import os
import time
from collections import OrderedDict
from pathlib import Path

import torch._inductor.config as _inductor

from fastvideo import VideoGenerator
from fastvideo.configs.pipelines.base import PipelineConfig
from fastvideo.utils import maybe_download_model

# Env knobs (set BEFORE importing fastvideo where possible — but
# FASTVIDEO_ATTENTION_BACKEND is fine here because the worker reads it
# on generator construction).
os.environ.setdefault("FASTVIDEO_ATTENTION_BACKEND", "FLASH_ATTN")
os.environ.setdefault("FASTVIDEO_STAGE_LOGGING", "1")

# Inductor knobs. The first one (shape_padding=False) is mandatory on
# Blackwell to avoid a cuBLAS INVALID_VALUE crash inside pad_mm during
# the refine path. The rest are autotune-friendliness flags.
_inductor.shape_padding = False
_inductor.conv_1x1_as_mm = True
_inductor.coordinate_descent_tuning = True
_inductor.coordinate_descent_check_all_directions = True
_inductor.epilogue_fusion = False

MODEL_ID = os.path.expandvars(
    os.path.expanduser(
        os.getenv("LTX23_MODEL_PATH", "FastVideo/LTX-2.3-Distilled-Diffusers")
    )
)
OUTPUT_DIR = Path(
    os.getenv("LTX23_OUTPUT_DIR", "outputs_video/ltx2_3_distilled_i2v")
)
I2V_IMAGE = os.getenv("LTX23_I2V_IMAGE", "")
DEFAULT_PROMPT = (
    "A fashion model takes a slow step forward and shifts her weight, "
    "the soft fabric of her clothing swaying and rippling with the "
    "motion, her hair shifting gently, soft even studio lighting on a "
    "clean light background, elegant slow-motion runway feel."
)
PROMPT = os.getenv("LTX23_I2V_PROMPT", DEFAULT_PROMPT)

# Per-stage timing helpers --------------------------------------------------

def _print_stage_breakdown(result: dict, label: str) -> float | None:
    """Print stage execution times and return the sum, or None if missing."""
    logging_info = result.get("logging_info")
    stages = getattr(logging_info, "stages", None) if logging_info else None
    if not stages:
        print(f"  [{label}] stage breakdown unavailable")
        return None
    print(f"  [{label}] stage breakdown:")
    total = 0.0
    for name, metrics in stages.items():
        exec_s = float(metrics.get("execution_time", 0.0))
        total += exec_s
        print(f"    - {name}: {exec_s:.3f}s")
    print(f"    - stage_sum: {total:.3f}s")
    return total


def _collect_stage_times(
    result: dict,
    stage_times: dict[str, list[float]],
    stage_order: OrderedDict[str, None],
) -> None:
    """Append one run's per-stage execution times to the running accumulators.

    ``stage_order`` records each stage name the first time it is seen, and
    ``stage_times`` gains one float per stage. Both are mutated in place.
    Nothing happens when ``result`` carries no stage data.
    """
    info = result.get("logging_info")
    stage_map = getattr(info, "stages", None) if info else None
    if stage_map:
        for stage_name, stage_metrics in stage_map.items():
            if stage_name not in stage_order:
                stage_order[stage_name] = None
            samples = stage_times.setdefault(stage_name, [])
            # A stage without an "execution_time" entry counts as zero.
            samples.append(float(stage_metrics.get("execution_time", 0.0)))


def _resolve_refine_upsampler(model_root: str) -> Path:
    """Locate the refine upsampler directory inside a model snapshot.

    ``spatial_upscaler`` is tried before ``spatial_upsampler``; a directory
    qualifies only if it contains a ``config.json`` file.

    Raises:
        FileNotFoundError: If neither directory holds a ``config.json``.
    """
    root = Path(model_root)
    candidates = (root / dirname for dirname in ("spatial_upscaler", "spatial_upsampler"))
    found = next(
        (path for path in candidates if (path / "config.json").is_file()),
        None,
    )
    if found is None:
        raise FileNotFoundError(
            f"No refine upsampler directory under {model_root}. "
            f"Expected `{model_root}/spatial_upscaler/config.json`."
        )
    return found


# Main ---------------------------------------------------------------------

def main() -> None:
    """Run the LTX-2.3 distilled i2v example with warmup and timed generations.

    Validates the ``LTX23_I2V_IMAGE`` input, resolves the model and its refine
    upsampler directory, builds a single-GPU generator with torch.compile
    enabled for the DiT, text encoder and VAE, then runs two warmup and two
    measured generations and prints per-run and averaged stage timings. The
    generator is shut down in a ``finally`` block.

    Raises:
        SystemExit: If ``LTX23_I2V_IMAGE`` is empty or does not point to a file.
    """
    if not I2V_IMAGE:
        raise SystemExit(
            "LTX23_I2V_IMAGE is required for i2v. Example:\n"
            "  export LTX23_I2V_IMAGE=/path/to/portrait_or_product.jpg\n"
            "  python examples/inference/basic/basic_ltx2_3_distilled_i2v.py"
        )
    if not Path(I2V_IMAGE).is_file():
        raise SystemExit(f"LTX23_I2V_IMAGE not found: {I2V_IMAGE}")

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    model_root = maybe_download_model(MODEL_ID)
    refine_upsampler_path = _resolve_refine_upsampler(model_root)
    print(f"Model:           {model_root}")
    print(f"Refine upsampler: {refine_upsampler_path}")
    print(f"i2v image:       {I2V_IMAGE}")
    print(f"Output dir:      {OUTPUT_DIR.resolve()}")

    # mode="default" — Inductor's default schedule matches max-autotune on
    # this pipeline (denoise/refine/decode all within ~5 ms, n=2) while
    # saving ~7 min of cold compile on a single GB200.
    torch_compile_kwargs = {
        "backend": "inductor",
        "fullgraph": True,
        "mode": "default",
        "dynamic": False,
    }

    # Loading the pipeline config *with model_path* binds model-specific
    # tuning (notably VAE precision/decoder defaults) into the config. Without
    # this, the generic pipeline config gives a substantially slower VAE
    # decode stage. `basic_ltx2_distilled_fast_profile.py` uses the same
    # pattern.
    pipeline_config = PipelineConfig.from_pretrained(model_root)
    pipeline_config.dit_config.quant_config = None

    generator = VideoGenerator.from_pretrained(
        model_root,
        num_gpus=1,
        # LTX-2.3 distilled uses the two-stage refine pipeline; the refine
        # LoRA is intentionally empty for the distilled student.
        ltx2_refine_enabled=True,
        ltx2_refine_upsampler_path=str(refine_upsampler_path),
        ltx2_refine_lora_path="",
        ltx2_refine_num_inference_steps=3,
        ltx2_refine_guidance_scale=1.0,
        ltx2_refine_add_noise=True,
        pipeline_config=pipeline_config,
        enable_torch_compile=True,
        enable_torch_compile_text_encoder=True,
        # Compile the VAE codec submodules (encoder / decoder) too. The
        # `LTX2CausalVideoAutoencoder` declares `_compile_conditions` so
        # `_compile_with_conditions` targets just those submodules and
        # leaves the surrounding tiling control flow eager — needed for
        # fullgraph + dynamic=False to succeed. VAE eager decode is
        # ~1.0s; compiling it brings the stage to ~0.3s.
        enable_torch_compile_vae=True,
        torch_compile_kwargs=torch_compile_kwargs,
        torch_compile_kwargs_vae=torch_compile_kwargs,
        # Keep everything resident — no CPU offload for serving-style runs.
        dit_cpu_offload=False,
        text_encoder_cpu_offload=False,
        vae_cpu_offload=False,
        ltx2_vae_tiling=False,
    )

    # Arguments shared by every warmup and measured call; only the output
    # path and seed vary per call.
    common_kwargs = dict(
        prompt=PROMPT,
        negative_prompt="",         # distilled is CFG-free; no negative needed
        guidance_scale=1.0,         # CFG=1 for distilled
        height=1280, width=832,     # portrait runway aspect
        num_frames=121, fps=24,     # ~5s clip
        num_inference_steps=8,      # distilled denoise steps
        # i2v: anchor the input image at frame 0 with full strength.
        # `ltx2_image_crf=0.0` skips an extra JPEG re-encode of an already
        # JPEG conditioning image.
        ltx2_images=[(I2V_IMAGE, 0, 1.0)],
        ltx2_image_crf=0.0,
        save_video=True,
    )

    warmup_runs = 2
    measured_runs = 2
    warmup_secs: list[float] = []
    measured_secs: list[float] = []
    # Per-stage samples from measured runs, plus first-seen stage ordering
    # so the summary prints stages in pipeline order.
    stage_times: dict[str, list[float]] = {}
    stage_order: OrderedDict[str, None] = OrderedDict()

    try:
        # Warmup: untimed (but we still wall-clock them so the first compile
        # cost is visible to the reader).
        for w in range(warmup_runs):
            t0 = time.perf_counter()
            print(f"\n[warmup {w + 1}/{warmup_runs}] compiling + generating…")
            generator.generate_video(
                output_path=str(OUTPUT_DIR / f"_warmup_{w + 1}.mp4"),
                seed=7,
                **common_kwargs,
            )
            dt = time.perf_counter() - t0
            warmup_secs.append(dt)
            print(f"[warmup {w + 1}/{warmup_runs}] wall={dt:.1f}s")

        # Cleanup warmup artifacts so the user only sees measured outputs.
        for w in range(warmup_runs):
            (OUTPUT_DIR / f"_warmup_{w + 1}.mp4").unlink(missing_ok=True)

        # Measured.
        for m in range(measured_runs):
            out_path = OUTPUT_DIR / f"output_ltx2_3_distilled_i2v_run_{m + 1}.mp4"
            print(f"\n[measured {m + 1}/{measured_runs}] generating: {out_path}")
            t0 = time.perf_counter()
            result = generator.generate_video(
                output_path=str(out_path),
                seed=2002 + m,
                **common_kwargs,
            )
            wall = time.perf_counter() - t0
            # Prefer the latency reported in the result dict; fall back to
            # our own wall clock when it is absent or falsy.
            e2e = (
                result.get("e2e_latency")
                if isinstance(result, dict) else None
            ) or wall
            measured_secs.append(e2e)
            print(f"[measured {m + 1}/{measured_runs}] e2e={e2e:.2f}s wall={wall:.2f}s")
            if isinstance(result, dict):
                _print_stage_breakdown(result, f"measured {m + 1}")
                _collect_stage_times(result, stage_times, stage_order)

        # Summary.
        print("\n=== summary ===")
        print(f"warmup wall-times:      {[round(x, 1) for x in warmup_secs]}")
        if measured_secs:
            avg = sum(measured_secs) / len(measured_secs)
            print(
                f"measured e2e (n={len(measured_secs)}): "
                f"{[round(x, 2) for x in measured_secs]} -> avg {avg:.2f}s"
            )
        if stage_times:
            print(f"average stage times over {measured_runs} measured runs:")
            avg_total = 0.0
            for name in stage_order:
                vals = stage_times.get(name) or []
                if not vals:
                    continue
                avg_v = sum(vals) / len(vals)
                avg_total += avg_v
                print(f"  - {name}: {avg_v:.3f}s")
            print(f"  - stage_sum_avg: {avg_total:.3f}s")
    finally:
        # Shut the generator down whether or not the runs above succeeded.
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_ltx2_3_distilled_i2v_typed.py

# SPDX-License-Identifier: Apache-2.0
"""LTX-2.3 distilled image-to-video — typed API (``from_config`` / ``generate``).

Identical generation behavior to ``basic_ltx2_3_distilled_i2v.py``, but
expressed through the newer typed surface (``GeneratorConfig`` /
``GenerationRequest``) instead of the ``from_pretrained(**legacy_kwargs)``
bridge. The typed API is now the preferred entry point — the legacy
example still works but emits a ``DeprecationWarning`` for the LTX-2.3
specific knobs.

Quick start
-----------
    export LTX23_I2V_IMAGE=/path/to/your/portrait_or_product.jpg
    # optional overrides:
    #   export LTX23_I2V_PROMPT="a fashion model walks toward camera..."
    #   export LTX23_OUTPUT_DIR=outputs_video/ltx2_3_distilled_i2v_typed
    python examples/inference/basic/basic_ltx2_3_distilled_i2v_typed.py

What the script does
--------------------
1. Loads FastVideo/LTX-2.3-Distilled-Diffusers (8 denoise + 3 refine
   steps, CFG=1, no refine LoRA — the distilled production recipe).
2. Compiles the DiT, text encoder, and VAE (fullgraph, Inductor default
   mode — autotune adds ~7 min cold-compile here with no measurable
   e2e gain).
3. Runs 2 warmup calls (untimed) + 2 measured calls. Two warmups are
   kept as a safety net — the first call pays cold compile + first-shape
   guard work, and a second warmup ensures any residual recompiles
   settle before we measure.
4. Prints a per-stage breakdown and an average over the measured runs.

Hardware notes
--------------
- Single-GPU example; for multi-GPU sequence-parallel see the gradio
  demo under ``examples/inference/gradio/local/gradio_local_demo_ltx2_3/``.
- First-time compile takes ~30-40 min on GB200 (~20 min on H100;
  cached in ``$TORCHINDUCTOR_CACHE_DIR`` afterwards). Subsequent
  invocations only pay the one-time process load + a few seconds of
  dynamo trace.
- On GB200 / Blackwell, run with ``env -u LD_LIBRARY_PATH ...`` to
  avoid a system-cuBLAS / torch-cuBLAS mismatch that fails every GEMM.
  The ``_inductor.shape_padding = False`` line below also avoids a
  ``pad_mm`` landmine on the same generation of cards.

Typed-API mapping (legacy kwarg ↔ typed field)
----------------------------------------------
- ``num_gpus``                      ↔ ``engine.num_gpus``
- ``enable_torch_compile``          ↔ ``engine.compile.enabled``
- ``enable_torch_compile_text_encoder`` ↔ ``engine.compile.text_encoder_enabled``
- ``enable_torch_compile_vae``      ↔ ``engine.compile.vae_enabled``
- ``torch_compile_kwargs``          ↔ ``engine.compile.backend/fullgraph/mode/dynamic``
- ``torch_compile_kwargs_vae``      ↔ empty ``compile.vae_kwargs`` (inherits master)
- ``dit_cpu_offload``               ↔ ``engine.offload.dit``
- ``text_encoder_cpu_offload``      ↔ ``engine.offload.text_encoder``
- ``vae_cpu_offload``               ↔ ``engine.offload.vae``
- ``ltx2_vae_tiling``               ↔ ``pipeline.vae_tiling``
- ``ltx2_refine_enabled``           ↔ ``pipeline.preset_overrides["refine"]["enabled"]``
- ``ltx2_refine_upsampler_path``    ↔ ``pipeline.components.upsampler_weights``
- ``ltx2_refine_lora_path``         ↔ ``pipeline.components.lora_path``
- ``ltx2_refine_num_inference_steps`` ↔ ``pipeline.preset_overrides["refine"]["num_inference_steps"]``
- ``ltx2_refine_guidance_scale``    ↔ ``pipeline.preset_overrides["refine"]["guidance_scale"]``
- ``ltx2_refine_add_noise``         ↔ ``pipeline.preset_overrides["refine"]["add_noise"]``
- ``pipeline_config=PipelineConfig.from_pretrained(model_root)`` ↔ (no-op — ``PipelineConfig.from_kwargs`` already resolves the model-specific class from ``model_path``)
- ``pipeline_config.dit_config.quant_config = None`` ↔ leave ``engine.quantization`` unset
- ``ltx2_images`` / ``ltx2_image_crf`` ↔ ``request.extensions`` (LTX-2 specific, no
  first-class typed field yet)
"""
from __future__ import annotations

import os
import time
from collections import OrderedDict
from pathlib import Path

import torch._inductor.config as _inductor

from fastvideo import VideoGenerator
from fastvideo.api import (
    CompileConfig,
    ComponentConfig,
    EngineConfig,
    GenerationRequest,
    GeneratorConfig,
    OffloadConfig,
    OutputConfig,
    PipelineSelection,
    SamplingConfig,
)
from fastvideo.utils import maybe_download_model

os.environ.setdefault("FASTVIDEO_ATTENTION_BACKEND", "FLASH_ATTN")
os.environ.setdefault("FASTVIDEO_STAGE_LOGGING", "1")

# Inductor knobs. ``shape_padding=False`` is mandatory on Blackwell to
# avoid a cuBLAS INVALID_VALUE crash inside pad_mm during the refine
# path. The rest are autotune-friendliness flags.
_inductor.shape_padding = False
_inductor.conv_1x1_as_mm = True
_inductor.coordinate_descent_tuning = True
_inductor.coordinate_descent_check_all_directions = True
_inductor.epilogue_fusion = False

MODEL_ID = os.path.expandvars(
    os.path.expanduser(
        os.getenv("LTX23_MODEL_PATH", "FastVideo/LTX-2.3-Distilled-Diffusers")
    )
)
OUTPUT_DIR = Path(
    os.getenv(
        "LTX23_OUTPUT_DIR", "outputs_video/ltx2_3_distilled_i2v_typed"
    )
)
I2V_IMAGE = os.getenv("LTX23_I2V_IMAGE", "")
DEFAULT_PROMPT = (
    "A fashion model takes a slow step forward and shifts her weight, "
    "the soft fabric of her clothing swaying and rippling with the "
    "motion, her hair shifting gently, soft even studio lighting on a "
    "clean light background, elegant slow-motion runway feel."
)
PROMPT = os.getenv("LTX23_I2V_PROMPT", DEFAULT_PROMPT)


def _print_stage_breakdown(result, label: str) -> float | None:
    logging_info = getattr(result, "logging_info", None)
    stages = getattr(logging_info, "stages", None) if logging_info else None
    if not stages:
        print(f"  [{label}] stage breakdown unavailable")
        return None
    print(f"  [{label}] stage breakdown:")
    total = 0.0
    for name, metrics in stages.items():
        exec_s = float(metrics.get("execution_time", 0.0))
        total += exec_s
        print(f"    - {name}: {exec_s:.3f}s")
    print(f"    - stage_sum: {total:.3f}s")
    return total


def _collect_stage_times(
    result,
    stage_times: dict[str, list[float]],
    stage_order: OrderedDict[str, None],
) -> None:
    logging_info = getattr(result, "logging_info", None)
    stages = getattr(logging_info, "stages", None) if logging_info else None
    if not stages:
        return
    for name, metrics in stages.items():
        stage_order.setdefault(name, None)
        stage_times.setdefault(name, []).append(
            float(metrics.get("execution_time", 0.0))
        )


def _resolve_refine_upsampler(model_root: str) -> Path:
    for name in ("spatial_upscaler", "spatial_upsampler"):
        cand = Path(model_root) / name
        if (cand / "config.json").is_file():
            return cand
    raise FileNotFoundError(
        f"No refine upsampler directory under {model_root}. "
        f"Expected `{model_root}/spatial_upscaler/config.json`."
    )


def main() -> None:
    """Run a warmup-then-measure LTX-2.3 distilled image-to-video benchmark.

    Requires the ``LTX23_I2V_IMAGE`` environment variable to name an existing
    image file and exits with ``SystemExit`` otherwise. Builds a typed
    ``GeneratorConfig``, runs two warmup generations whose output files are
    deleted afterwards, then two measured generations, printing a per-run
    stage breakdown and a final summary. The generator is shut down in a
    ``finally`` block, so it is released even when a run raises.
    """
    # Fail fast with a usage hint before any model download or compile work.
    if not I2V_IMAGE:
        raise SystemExit(
            "LTX23_I2V_IMAGE is required for i2v. Example:\n"
            "  export LTX23_I2V_IMAGE=/path/to/portrait_or_product.jpg\n"
            "  python examples/inference/basic/"
            "basic_ltx2_3_distilled_i2v_typed.py"
        )
    if not Path(I2V_IMAGE).is_file():
        raise SystemExit(f"LTX23_I2V_IMAGE not found: {I2V_IMAGE}")

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    model_root = maybe_download_model(MODEL_ID)
    refine_upsampler_path = _resolve_refine_upsampler(model_root)
    print(f"Model:           {model_root}")
    print(f"Refine upsampler: {refine_upsampler_path}")
    print(f"i2v image:       {I2V_IMAGE}")
    print(f"Output dir:      {OUTPUT_DIR.resolve()}")

    # mode="default" — Inductor's default schedule matches max-autotune on
    # this pipeline (denoise/refine/decode all within ~5 ms, n=2) while
    # saving ~7 min of cold compile on a single GB200.
    generator_config = GeneratorConfig(
        model_path=model_root,
        engine=EngineConfig(
            num_gpus=1,
            # Keep DiT / text encoder / VAE resident on GPU — no CPU offload
            # for serving-style runs. ``image_encoder`` and
            # ``pin_cpu_memory`` are left at their schema defaults
            # (matches the legacy example, which only set these three).
            offload=OffloadConfig(
                dit=False,
                text_encoder=False,
                vae=False,
            ),
            compile=CompileConfig(
                enabled=True,
                text_encoder_enabled=True,
                # ``vae_enabled`` triggers ``_compile_with_conditions`` on
                # ``LTX2CausalVideoAutoencoder``, which compiles just the
                # encoder/decoder submodules and leaves the surrounding
                # tiling control flow eager (required for ``fullgraph``).
                # Empty ``vae_kwargs`` → inherits the master kwargs below.
                vae_enabled=True,
                backend="inductor",
                fullgraph=True,
                mode="default",
                dynamic=False,
            ),
        ),
        pipeline=PipelineSelection(
            # ``PipelineConfig.from_kwargs`` resolves the model-specific
            # pipeline-config class from ``model_path`` automatically, so we
            # don't need to set ``components.pipeline_config_path`` — the
            # model-specific VAE precision / decoder defaults are picked up
            # the same way the legacy example's
            # ``PipelineConfig.from_pretrained(model_root)`` did them.
            components=ComponentConfig(
                upsampler_weights=str(refine_upsampler_path),
                # Distilled has no refine LoRA — omit ``lora_path``.
            ),
            vae_tiling=False,
            preset_overrides={
                "refine": {
                    "enabled": True,
                    "num_inference_steps": 3,
                    "guidance_scale": 1.0,
                    "add_noise": True,
                },
            },
        ),
    )

    generator = VideoGenerator.from_config(generator_config)

    def build_request(out_path: Path, seed: int) -> GenerationRequest:
        """Build the i2v request; only the output path and seed vary per run."""
        return GenerationRequest(
            prompt=PROMPT,
            # distilled is CFG-free; no negative prompt
            negative_prompt="",
            sampling=SamplingConfig(
                num_videos_per_prompt=1,
                seed=seed,
                height=1280,
                width=832,
                num_frames=121,
                fps=24,
                num_inference_steps=8,
                guidance_scale=1.0,
            ),
            output=OutputConfig(
                output_path=str(out_path),
                save_video=True,
                return_frames=False,
            ),
            # LTX-2.3 i2v fields don't have first-class typed slots yet;
            # extensions is the documented bridge. ``ltx2_image_crf=0.0``
            # skips an extra JPEG re-encode of an already JPEG image.
            extensions={
                "ltx2_images": [(I2V_IMAGE, 0, 1.0)],
                "ltx2_image_crf": 0.0,
            },
        )

    warmup_runs = 2
    measured_runs = 2
    warmup_secs: list[float] = []
    measured_secs: list[float] = []
    # Per-stage timings of the measured runs, plus first-seen stage order so
    # the summary prints stages in a stable order.
    stage_times: dict[str, list[float]] = {}
    stage_order: OrderedDict[str, None] = OrderedDict()

    try:
        # Warmup runs are timed by wall-clock only and excluded from the
        # measured averages.
        for w in range(warmup_runs):
            print(f"\n[warmup {w + 1}/{warmup_runs}] compiling + generating…")
            t0 = time.perf_counter()
            generator.generate(
                build_request(
                    OUTPUT_DIR / f"_warmup_{w + 1}.mp4", seed=7
                )
            )
            dt = time.perf_counter() - t0
            warmup_secs.append(dt)
            print(f"[warmup {w + 1}/{warmup_runs}] wall={dt:.1f}s")

        # Warmup outputs are throwaway; ``missing_ok`` tolerates a run that
        # did not write its file.
        for w in range(warmup_runs):
            (OUTPUT_DIR / f"_warmup_{w + 1}.mp4").unlink(missing_ok=True)

        for m in range(measured_runs):
            out_path = (
                OUTPUT_DIR
                / f"output_ltx2_3_distilled_i2v_typed_run_{m + 1}.mp4"
            )
            print(
                f"\n[measured {m + 1}/{measured_runs}] generating: {out_path}"
            )
            t0 = time.perf_counter()
            result = generator.generate(
                build_request(out_path, seed=2002 + m)
            )
            wall = time.perf_counter() - t0
            # ``e2e_latency`` is currently surfaced via ``result.extra``;
            # ``GenerationResult`` exposes ``generation_time`` as a
            # first-class field but the LTX-2 pipeline only fills the
            # legacy ``e2e_latency`` key. Prefer the explicit one, fall
            # back to wall-clock.
            # The trailing ``or wall`` also covers a missing or falsy value.
            e2e = (
                result.extra.get("e2e_latency")
                if hasattr(result, "extra") else None
            ) or wall
            measured_secs.append(e2e)
            print(
                f"[measured {m + 1}/{measured_runs}] "
                f"e2e={e2e:.2f}s wall={wall:.2f}s"
            )
            _print_stage_breakdown(result, f"measured {m + 1}")
            _collect_stage_times(result, stage_times, stage_order)

        print("\n=== summary ===")
        print(
            f"warmup wall-times:      "
            f"{[round(x, 1) for x in warmup_secs]}"
        )
        if measured_secs:
            avg = sum(measured_secs) / len(measured_secs)
            print(
                f"measured e2e (n={len(measured_secs)}): "
                f"{[round(x, 2) for x in measured_secs]} -> avg {avg:.2f}s"
            )
        if stage_times:
            print(f"average stage times over {measured_runs} measured runs:")
            avg_total = 0.0
            for name in stage_order:
                vals = stage_times.get(name) or []
                if not vals:
                    continue
                avg_v = sum(vals) / len(vals)
                avg_total += avg_v
                print(f"  - {name}: {avg_v:.3f}s")
            print(f"  - stage_sum_avg: {avg_total:.3f}s")
    finally:
        # Release the generator even when a run above raised.
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_ltx2_distilled.py

from fastvideo import VideoGenerator

# Text prompt passed to ``generate_video`` in ``main``: a short scene with
# dialogue and camera directions.
PROMPT = (
    "A warm sunny backyard. The camera starts in a tight cinematic close-up "
    "of a woman and a man in their 30s, facing each other with serious "
    "expressions. The woman, emotional and dramatic, says softly, \"That's "
    "it... Dad's lost it. And we've lost Dad.\" The man exhales, slightly "
    "annoyed: \"Stop being so dramatic, Jess.\" A beat. He glances aside, "
    "then mutters defensively, \"He's just having fun.\" The camera slowly "
    "pans right, revealing the grandfather in the garden wearing enormous "
    "butterfly wings, waving his arms in the air like he's trying to take "
    "off. He shouts, \"Wheeeew!\" as he flaps his wings with full commitment. "
    "The woman covers her face, on the verge of tears. The tone is deadpan, "
    "absurd, and quietly tragic."
)
import os
# Unconditionally select the FLASH_ATTN attention backend; this overwrites
# any value already exported in the environment.
os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "FLASH_ATTN"

def main() -> None:
    """Generate one text-to-video sample with the distilled LTX-2 model.

    Loads ``FastVideo/LTX2-Distilled-Diffusers`` on four GPUs, renders
    ``PROMPT`` to a fixed output path, and always shuts the generator down,
    including when generation raises.
    """
    generator = VideoGenerator.from_pretrained(
        "FastVideo/LTX2-Distilled-Diffusers",
        num_gpus=4,
    )

    output_path = "outputs_video/ltx2_basic/output_ltx2_distilled_t2v.mp4"
    try:
        generator.generate_video(
            prompt=PROMPT,
            output_path=output_path,
            save_video=True,
        )
    finally:
        # Shut down in ``finally`` so a failed generation does not leave the
        # generator running.
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_ltx2_distilled_fast_profile.py

# SPDX-License-Identifier: Apache-2.0
import json
import os
import time
from collections import OrderedDict
from pathlib import Path

import torch
import torch._inductor.config
from fastvideo import VideoGenerator
from fastvideo.configs.pipelines.base import PipelineConfig
from fastvideo.layers.quantization.nvfp4_config import NVFP4Config
from fastvideo.utils import maybe_download_model

# Validation prompts file, located relative to this script: two directories
# above the script's own directory, then training/finetune/ltx2/validation.json.
VALIDATION_JSON = (
    Path(__file__).resolve().parents[2] / "training" / "finetune" / "ltx2" / "validation.json"
)

# Override with a local snapshot or converted directory when needed, e.g.
#   export LTX2_MODEL_PATH=/raid/$USER/hf/FastVideo/LTX2-Distilled-Diffusers
# ``~`` and ``$VARS`` in the value are expanded.
MODEL_ID = os.path.expandvars(
    os.path.expanduser(os.getenv("LTX2_MODEL_PATH", "FastVideo/LTX2-Distilled-Diffusers"))
)
# Directory that receives one output video per benchmark run.
OUTPUT_DIR = Path("outputs_video/ltx2_distilled_fast_profile")

# These assignments overwrite any value already exported in the environment.
# NOTE(review): FASTVIDEO_STAGE_LOGGING presumably enables the per-stage
# timings read from ``logging_info.stages`` below — confirm.
os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "FLASH_ATTN"
os.environ["FASTVIDEO_STAGE_LOGGING"] = "1"

# Tune Inductor flags
config = torch._inductor.config
config.conv_1x1_as_mm = True  # treat 1x1 convolutions as matrix muls
config.coordinate_descent_tuning = True
config.coordinate_descent_check_all_directions = True
config.epilogue_fusion = False  # do not fuse pointwise ops into matmuls


def load_validation_entries(path: Path) -> list[dict]:
    """Load validation entries from a JSON file shaped like ``{"data": [...]}``.

    Items of the ``data`` list that are not dicts are dropped.

    Raises:
        ValueError: the top-level value is not a dict with a list under
            ``"data"``.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))

    records = payload.get("data") if isinstance(payload, dict) else None
    if not isinstance(records, list):
        raise ValueError(f"Unsupported validation format in {path}. Expected {{'data': [...]}}.")

    return [item for item in records if isinstance(item, dict)]


def print_stage_breakdown(
    result: dict,
    run_idx: int,
    num_runs: int,
) -> float | None:
    logging_info = result.get("logging_info")
    if logging_info is None:
        print(f"[{run_idx}/{num_runs}] Stage breakdown unavailable: no logging_info")
        return None

    stages = getattr(logging_info, "stages", None)
    if not stages:
        print(f"[{run_idx}/{num_runs}] Stage breakdown unavailable: no stage timings")
        return None

    print(f"[{run_idx}/{num_runs}] Stage breakdown:")
    total = 0.0
    for stage_name, stage_metrics in stages.items():
        exec_time = float(stage_metrics.get("execution_time", 0.0))
        total += exec_time
        print(f"  - {stage_name}: {exec_time:.3f}s")
    print(f"  - total(stage sum): {total:.3f}s")
    return total


def extract_sr_forward_latency(
    result: dict,
) -> tuple[float | None, list[tuple[str, float]], list[str]]:
    logging_info = result.get("logging_info")
    if logging_info is None:
        return None, [], []

    stages = getattr(logging_info, "stages", None)
    if not stages:
        return None, [], []

    stage_names = list(stages.keys())
    sr_match_substr = os.getenv("FASTVIDEO_SR_LATENCY_STAGE_SUBSTR", "").strip().lower()

    sr_stage_entries: list[tuple[str, float]] = []
    for stage_name, stage_metrics in stages.items():
        stage_name_l = stage_name.lower()
        if sr_match_substr:
            is_sr_stage = sr_match_substr in stage_name_l
        else:
            is_sr_stage = (
                "srdenoisingstage" in stage_name_l
                or "sr_denoising" in stage_name_l
                or "upsample" in stage_name_l
                or ("refine" in stage_name_l and "denois" in stage_name_l)
            )
        if not is_sr_stage:
            continue
        exec_time = float(stage_metrics.get("execution_time", 0.0))
        sr_stage_entries.append((stage_name, exec_time))

    if not sr_stage_entries:
        return None, [], stage_names
    return sum(x[1] for x in sr_stage_entries), sr_stage_entries, stage_names


def collect_stage_times(
    result: dict,
    stage_times: dict[str, list[float]],
    stage_order: OrderedDict[str, None],
) -> None:
    """Append one run's stage timings to the running accumulators.

    ``stage_order`` records each stage name the first time it is seen, and
    ``stage_times`` collects one float per stage per call. Both are mutated
    in place. Nothing happens when ``result`` carries no stage data.
    """
    info = result.get("logging_info")
    stage_map = None if info is None else getattr(info, "stages", None)
    if not stage_map:
        return

    for stage, entry in stage_map.items():
        if stage not in stage_order:
            stage_order[stage] = None
        seconds = float(entry.get("execution_time", 0.0))
        stage_times.setdefault(stage, []).append(seconds)


def print_stage_averages(
    stage_times: dict[str, list[float]],
    stage_order: OrderedDict[str, None],
    measured_runs: int,
) -> None:
    """Print the mean execution time of each stage, in first-seen order.

    Prints nothing when ``measured_runs`` is not positive, and a one-line
    notice when no timings were collected. Stages listed in ``stage_order``
    that have no samples are skipped. The final line is the sum of the
    per-stage means.
    """
    if measured_runs <= 0:
        return
    if not stage_times:
        print("No stage timings collected for measured runs.")
        return

    print(f"Average stage times over {measured_runs} measured runs:")
    grand_total = 0.0
    for stage in stage_order:
        samples = stage_times.get(stage, [])
        if samples:
            mean = sum(samples) / len(samples)
            grand_total += mean
            print(f"  - {stage}: {mean:.3f}s")
    print(f"  - total(stage sum avg): {grand_total:.3f}s")


def resolve_refine_upsampler_path(model_root: str) -> Path:
    """Locate the LTX2 refine upsampler directory.

    Search order: the ``LTX2_REFINE_UPSAMPLER_PATH`` environment variable
    when set (``~`` and ``$VARS`` expanded), then
    ``<model_root>/spatial_upscaler``, then ``<model_root>/spatial_upsampler``.
    The first directory containing a ``config.json`` file is returned.

    Raises:
        FileNotFoundError: no candidate qualifies; the message lists every
            path that was checked.
    """
    base = Path(model_root)

    search: list[Path] = []
    override = os.getenv("LTX2_REFINE_UPSAMPLER_PATH")
    if override:
        search.append(Path(os.path.expandvars(os.path.expanduser(override))))
    search.extend([base / "spatial_upscaler", base / "spatial_upsampler"])

    hit = next((path for path in search if (path / "config.json").is_file()), None)
    if hit is not None:
        return hit

    listing = "\n".join(f"  - {path}" for path in search)
    raise FileNotFoundError(
        "Could not find an LTX2 refine upsampler directory.\n"
        "Checked:\n"
        f"{listing}"
    )


def main() -> None:
    """Benchmark LTX-2 distilled text-to-video generation with NVFP4 + compile.

    Uses the caption (and optional height/width) of the first entry in
    ``VALIDATION_JSON`` as the prompt, runs ``num_runs`` generations, treats
    the first ``warmup_runs`` as warmup, and prints per-run and averaged
    timings (generation time, end-to-end latency, SR stage latency, per-stage
    averages, non-stage overhead). The generator is always shut down on exit.

    Raises:
        FileNotFoundError: the validation file or the refine upsampler
            directory cannot be found.
        ValueError: the validation file has no entries or the first entry
            lacks a non-empty string ``caption``.
    """
    if not VALIDATION_JSON.exists():
        raise FileNotFoundError(f"Validation file not found: {VALIDATION_JSON}")

    validation_entries = load_validation_entries(VALIDATION_JSON)
    if not validation_entries:
        raise ValueError(f"No validation entries found in {VALIDATION_JSON}")

    benchmark_entry = validation_entries[0]
    prompt = benchmark_entry.get("caption")
    if not isinstance(prompt, str) or not prompt.strip():
        raise ValueError("First validation entry is missing a usable caption")

    num_runs = 12
    warmup_runs = 2
    avg_window = num_runs - warmup_runs
    # Index of the first run that counts towards the averages.
    measured_start_idx = max(warmup_runs, num_runs - avg_window)

    model_root = maybe_download_model(MODEL_ID)
    refine_upsampler_path = resolve_refine_upsampler_path(model_root)
    print(f"Using refine upsampler: {refine_upsampler_path}")

    pipeline_config = PipelineConfig.from_pretrained(model_root)
    # LTX-2 NVFP4 deploy contract (train==deploy surface):
    #   * Linears: NVFP4 block-scaled GEMMs (per-16 E2M1 + E4M3 SFs) on every
    #     arch, via flashinfer.
    #   * ATTN_QAT_INFER attention differs per arch: sm_120a/sm_121a use the
    #     fastvideo-kernel CUTLASS (SageAttention3-FP4) scheme that
    #     ATTN_QAT_TRAIN simulates; sm_100a (GB200) / sm_103a (GB300) use the
    #     FP4 FA4 kernel (flash-attention-fp4) with per-16 block-scaled NVFP4
    #     Q/K and BF16 P/V -- a train-sim mismatch that is gated by MS-SSIM
    #     measurement, not assumed equal. The selection receipt is logged at
    #     backend resolution ("ATTN_QAT_INFER resolved: ...").
    # Original-weight retention: the default purges the always-FP4 layers'
    # bf16 originals after conversion. Refine-only layers (the cross-modal
    # AV projections) always keep theirs: the base stage profile runs them
    # dense by deployment contract -- in the two-stage fast profile AND the
    # distilled single-stage deploy. retain_original_weights=True keeps
    # everything (debugging).
    pipeline_config.dit_config.quant_config = NVFP4Config()
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    torch_compile_kwargs = {
        "backend": "inductor",
        "fullgraph": True,
        # Uncomment for final best-performance profiling. It is disabled
        # for faster development iteration because autotuning is slow.
        # "mode": "max-autotune-no-cudagraphs",
        "dynamic": False,
    }

    generator = VideoGenerator.from_pretrained(
        model_root,
        num_gpus=1,
        ltx2_refine_enabled=True,
        ltx2_refine_upsampler_path=str(refine_upsampler_path),
        refine_lora_path="",  # keep refine LoRA disabled in this repo's typed adapter
        ltx2_refine_lora_path="",  # keep refine LoRA disabled for distilled model
        ltx2_refine_num_inference_steps=2,
        ltx2_refine_guidance_scale=1.0,
        ltx2_refine_add_noise=True,
        pipeline_config=pipeline_config,
        enable_torch_compile=True,
        enable_torch_compile_text_encoder=True,
        enable_torch_compile_vae=True,
        torch_compile_kwargs=torch_compile_kwargs,
        torch_compile_kwargs_vae=torch_compile_kwargs,
        dit_cpu_offload=False,
        text_encoder_cpu_offload=False,
        vae_cpu_offload=False,
        ltx2_vae_tiling=False,
    )

    run_times: list[float] = []
    e2e_times: list[float] = []
    sr_forward_times: list[float] = []
    non_stage_overhead_times: list[float] = []
    stage_times: dict[str, list[float]] = {}
    stage_order: OrderedDict[str, None] = OrderedDict()

    try:
        for i in range(num_runs):
            output_path = OUTPUT_DIR / f"output_ltx2_basic_t2v_run_{i + 1}.mp4"
            if output_path.exists():
                output_path.unlink()
                print(f"[{i + 1}/{num_runs}] Removed existing file: {output_path}")

            print(f"[{i + 1}/{num_runs}] Generating: {output_path}")
            # Bracket the timed region with explicit CUDA syncs only when
            # stage logging is switched off ("0") and CUDA is present. The
            # same guard now covers both syncs; previously the trailing one
            # skipped the availability check.
            # NOTE(review): presumably stage logging synchronizes on its own
            # when enabled — confirm.
            needs_sync = (
                os.environ.get("FASTVIDEO_STAGE_LOGGING") == "0"
                and torch.cuda.is_available()
            )
            if needs_sync:
                torch.cuda.synchronize()

            start = time.perf_counter()
            result = generator.generate_video(
                prompt=prompt,
                output_path=str(output_path),
                fps=24,
                seed=10,
                save_video=True,
                guidance_scale=1.0,
                height=benchmark_entry.get("height", 1088),
                width=benchmark_entry.get("width", 1920),
                num_frames=121,
                num_inference_steps=5,
                # image_path="examples/inference/basic/prompt1.png",
                # ltx2_image_crf=0.0
            )
            if needs_sync:
                torch.cuda.synchronize()
            # Read the clock once so both fallbacks below report the same
            # wall-clock value.
            wall = time.perf_counter() - start

            elapsed = result.get("generation_time") if isinstance(result, dict) else None
            e2e_elapsed = result.get("e2e_latency") if isinstance(result, dict) else None
            if elapsed is None:
                elapsed = wall
            if e2e_elapsed is None:
                e2e_elapsed = wall

            run_times.append(elapsed)
            e2e_times.append(e2e_elapsed)
            print(f"[{i + 1}/{num_runs}] Generation time: {elapsed:.2f}s")
            print(f"[{i + 1}/{num_runs}] End-to-end latency: {e2e_elapsed:.2f}s")

            if isinstance(result, dict):
                stage_sum = print_stage_breakdown(result, i + 1, num_runs)
                if stage_sum is not None:
                    non_stage_overhead = e2e_elapsed - stage_sum
                    print(f"[{i + 1}/{num_runs}] Non-stage overhead (e2e - stage sum): {non_stage_overhead:.3f}s")
                    if i >= measured_start_idx:
                        non_stage_overhead_times.append(non_stage_overhead)

                sr_forward_total, sr_stage_entries, stage_names = extract_sr_forward_latency(result)
                if sr_forward_total is None:
                    print(f"[{i + 1}/{num_runs}] SR forward latency unavailable")
                    if stage_names:
                        print(f"    Available stage keys: {', '.join(stage_names)}")
                        print("    Tip: set FASTVIDEO_SR_LATENCY_STAGE_SUBSTR=<substring> to match your SR stage key.")
                else:
                    print(f"[{i + 1}/{num_runs}] SR forward latency: {sr_forward_total:.3f}s")
                    for sr_stage_name, sr_exec_time in sr_stage_entries:
                        print(f"    - {sr_stage_name}: {sr_exec_time:.3f}s")
                    if i >= measured_start_idx:
                        sr_forward_times.append(sr_forward_total)

                # Only post-warmup runs feed the per-stage averages.
                if i >= measured_start_idx:
                    collect_stage_times(result, stage_times, stage_order)

        measured_times = run_times[measured_start_idx:]
        avg_time = sum(measured_times) / len(measured_times)
        print(
            f"Average video generation time over {len(measured_times)} runs "
            f"(runs {measured_start_idx + 1}-{len(run_times)}, skipping first {warmup_runs} warmup runs): "
            f"{avg_time:.2f}s"
        )

        measured_e2e_times = e2e_times[measured_start_idx:]
        avg_e2e_time = sum(measured_e2e_times) / len(measured_e2e_times)
        print(
            f"Average end-to-end latency over {len(measured_e2e_times)} runs "
            f"(runs {measured_start_idx + 1}-{len(e2e_times)}, skipping first {warmup_runs} warmup runs): "
            f"{avg_e2e_time:.2f}s"
        )

        if sr_forward_times:
            avg_sr_forward = sum(sr_forward_times) / len(sr_forward_times)
            print(f"Average SR forward latency over {len(sr_forward_times)} runs: {avg_sr_forward:.3f}s")
        else:
            print("Average SR forward latency unavailable (no SR stages matched).")

        print_stage_averages(stage_times, stage_order, len(measured_times))

        if non_stage_overhead_times:
            avg_non_stage_overhead = sum(non_stage_overhead_times) / len(non_stage_overhead_times)
            print(
                "Average non-stage overhead over "
                f"{len(non_stage_overhead_times)} measured runs: {avg_non_stage_overhead:.3f}s"
            )
        else:
            print("Average non-stage overhead unavailable (no stage timings).")
    finally:
        # Release the generator even when a run above raised.
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_lucy_edit.py

from fastvideo import VideoGenerator

OUTPUT_PATH = "video_samples_lucy_edit"


def main():
    """Edit a reference video with Lucy-Edit-Dev, guided by a text instruction."""
    # Loader settings: single GPU, no FSDP; the DiT and text encoder are
    # offloaded to CPU while the VAE is not.
    load_options = {
        "num_gpus": 1,
        "use_fsdp_inference": False,
        "dit_cpu_offload": True,
        "vae_cpu_offload": False,
        "text_encoder_cpu_offload": True,
        "pin_cpu_memory": True,
    }
    generator = VideoGenerator.from_pretrained(
        "decart-ai/Lucy-Edit-Dev",
        **load_options,
    )

    edit_instruction = (
        "Change the apron and blouse to a classic clown costume: satin "
        "polka-dot jumpsuit in bright primary colors, ruffled white collar, "
        "oversized pom-pom buttons, white gloves, oversized red shoes, red "
        "foam nose; soft window light from left, eye-level medium shot."
    )
    source_video = "https://d2drjpuinn46lb.cloudfront.net/painter_original_edit.mp4"

    # Sampling and output settings for the edit.
    request = {
        "negative_prompt": "",
        "video_path": source_video,
        "output_path": OUTPUT_PATH,
        "save_video": True,
        "height": 480,
        "width": 832,
        "num_frames": 81,
        "fps": 24,
        "guidance_scale": 5.0,
    }
    generator.generate_video(edit_instruction, **request)


if __name__ == "__main__":
    main()

basic_matrixgame2.py

from fastvideo import VideoGenerator
from fastvideo.models.dits.matrixgame2.utils import create_action_presets

import torch

# Available variants: "base_distilled_model", "gta_distilled_model", "templerun_distilled_model"
# Each variant has different keyboard_dim:
#   - base_distilled_model: keyboard_dim=4
#   - gta_distilled_model: keyboard_dim=2
#   - templerun_distilled_model: keyboard_dim=7 (keyboard only, no mouse)
# Change this value to pick which entry of VARIANT_CONFIG ``main`` uses.
MODEL_VARIANT = "base_distilled_model"

# Variant-specific settings, keyed by variant name:
#   model_path   - model id passed to ``VideoGenerator.from_pretrained``
#   keyboard_dim - keyboard action dimension passed to ``create_action_presets``
#   image_url    - conditioning image passed as ``image_path``
VARIANT_CONFIG = {
    "base_distilled_model": {
        "model_path": "FastVideo/Matrix-Game-2.0-Base-Distilled-Diffusers",
        "keyboard_dim": 4,
        "image_url": "https://raw.githubusercontent.com/SkyworkAI/Matrix-Game/main/Matrix-Game-2/demo_images/universal/0000.png",
    },
    "gta_distilled_model": {
        "model_path": "FastVideo/Matrix-Game-2.0-GTA-Distilled-Diffusers",
        "keyboard_dim": 2,
        "image_url": "https://raw.githubusercontent.com/SkyworkAI/Matrix-Game/main/Matrix-Game-2/demo_images/gta_drive/0000.png",
    },
    "templerun_distilled_model": {
        "model_path": "FastVideo/Matrix-Game-2.0-TempleRun-Distilled-Diffusers",
        "keyboard_dim": 7,
        "image_url": "https://raw.githubusercontent.com/SkyworkAI/Matrix-Game/main/Matrix-Game-2/demo_images/temple_run/0000.png",
    },
}


OUTPUT_PATH = "video_samples_matrixgame2"


def main():
    """Generate a Matrix-Game 2.0 video for ``MODEL_VARIANT`` from preset actions."""
    variant = VARIANT_CONFIG[MODEL_VARIANT]

    # FastVideo picks default arguments for the model and handles the
    # distributed setup itself.
    load_options = {
        "num_gpus": 1,
        # Set to True if the GPU runs out of memory.
        "use_fsdp_inference": False,
        # The DiT needs to be offloaded for MoE.
        "dit_cpu_offload": True,
        "vae_cpu_offload": False,
        "text_encoder_cpu_offload": True,
        # Set to False when CPU RAM is limited and CPU-GPU transfers are
        # infrequent.
        "pin_cpu_memory": True,
    }
    generator = VideoGenerator.from_pretrained(
        variant["model_path"],
        **load_options,
    )

    num_frames = 597
    presets = create_action_presets(
        num_frames, keyboard_dim=variant["keyboard_dim"]
    )
    grid_sizes = torch.tensor([150, 44, 80])

    # ``unsqueeze(0)`` adds a leading dimension to each action tensor.
    mouse_cond = presets["mouse"].unsqueeze(0)
    keyboard_cond = presets["keyboard"].unsqueeze(0)

    generator.generate_video(
        prompt="",
        image_path=variant["image_url"],
        mouse_cond=mouse_cond,
        keyboard_cond=keyboard_cond,
        grid_sizes=grid_sizes,
        num_frames=num_frames,
        height=352,
        width=640,
        num_inference_steps=50,
        output_path=OUTPUT_PATH,
        save_video=True,
    )


if __name__ == "__main__":
    main()

basic_matrixgame2_streaming.py

from fastvideo.entrypoints.streaming_generator import StreamingVideoGenerator
from fastvideo.models.dits.matrixgame2.utils import get_current_action_async, expand_action_to_frames

import torch
import asyncio

# Available variants: "base_distilled_model", "gta_distilled_model", "templerun_distilled_model"
# Each variant has different keyboard_dim:
#   - base_distilled_model: keyboard_dim=4
#   - gta_distilled_model: keyboard_dim=2
#   - templerun_distilled_model: keyboard_dim=7 (keyboard only, no mouse)
# Change this value to pick which entry of VARIANT_CONFIG ``main`` uses.
MODEL_VARIANT = "base_distilled_model"

# Variant-specific settings, keyed by variant name:
#   model_path   - model id passed to ``StreamingVideoGenerator.from_pretrained``
#   keyboard_dim - width of the zero-filled keyboard action tensor
#   mode         - value passed to ``get_current_action_async``
#   image_url    - conditioning image passed as ``image_path``
VARIANT_CONFIG = {
    "base_distilled_model": {
        "model_path": "FastVideo/Matrix-Game-2.0-Base-Distilled-Diffusers",
        "keyboard_dim": 4,
        "mode": "universal",
        "image_url": "https://raw.githubusercontent.com/SkyworkAI/Matrix-Game/main/Matrix-Game-2/demo_images/universal/0000.png",
    },
    "gta_distilled_model": {
        "model_path": "FastVideo/Matrix-Game-2.0-GTA-Distilled-Diffusers",
        "keyboard_dim": 2,
        "mode": "gta_drive",
        "image_url": "https://raw.githubusercontent.com/SkyworkAI/Matrix-Game/main/Matrix-Game-2/demo_images/gta_drive/0000.png",
    },
    "templerun_distilled_model": {
        "model_path": "FastVideo/Matrix-Game-2.0-TempleRun-Distilled-Diffusers",
        "keyboard_dim": 7,
        "mode": "templerun",
        "image_url": "https://raw.githubusercontent.com/SkyworkAI/Matrix-Game/main/Matrix-Game-2/demo_images/temple_run/0000.png",
    },
}


OUTPUT_PATH = "video_samples_matrixgame2"


async def main():
    """Interactively stream a Matrix-Game 2.0 video, one action block at a time.

    Loads the model selected by ``MODEL_VARIANT``, resets the streaming
    generator with zero-filled actions, then repeatedly reads an action,
    advances the generator one step, and asks whether to continue (up to
    ``max_blocks`` blocks). The video is finalized once the loop ends. The
    generator is always shut down, including when a step raises or the
    session is interrupted.
    """
    # FastVideo will automatically use the optimal default arguments for the
    # model.
    # If a local path is provided, FastVideo will make a best effort
    # attempt to identify the optimal arguments.
    config = VARIANT_CONFIG[MODEL_VARIANT]

    generator = StreamingVideoGenerator.from_pretrained(
        config["model_path"],
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,  # DiT needs to be offloaded for MoE
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        # Set pin_cpu_memory to False if CPU RAM is limited and CPU-GPU
        # transfers are infrequent.
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )

    try:
        max_blocks = 50
        num_frames = 597
        # Initial conditioning is all zeros; real actions arrive per block.
        actions = {
            "keyboard": torch.zeros((num_frames, config["keyboard_dim"])),
            "mouse": torch.zeros((num_frames, 2))
        }
        grid_sizes = torch.tensor([150, 44, 80])
        mode = config["mode"]

        generator.reset(
            prompt="",
            image_path=config["image_url"],
            mouse_cond=actions["mouse"].unsqueeze(0),
            keyboard_cond=actions["keyboard"].unsqueeze(0),
            grid_sizes=grid_sizes,
            num_frames=num_frames,
            height=352,
            width=640,
            num_inference_steps=50,
            output_path=OUTPUT_PATH,
            save_video=True,
        )
        print("Initialization complete.")

        for block_id in range(max_blocks):
            print(f"\n=== Block {block_id + 1}/{max_blocks} ===")

            action = await get_current_action_async(mode)
            # NOTE(review): 12 is presumably the number of frames per
            # streamed block — confirm against expand_action_to_frames.
            keyboard_cond, mouse_cond = expand_action_to_frames(action, 12)
            await generator.step_async(keyboard_cond, mouse_cond)

            # ``input`` blocks, so run it in a worker thread to keep the
            # event loop free.
            if (await asyncio.to_thread(input, "\nContinue? (y/n): ")).lower() == 'n':
                break

        # Save final video
        generator.finalize()
    finally:
        # Shut down even if reset/step/finalize raised or the user
        # interrupted the session.
        generator.shutdown()


if __name__ == "__main__":
    asyncio.run(main())

basic_matrixgame3.py

from fastvideo import VideoGenerator

MODEL_PATH = "FastVideo/Matrix-Game-3.0-Base-Distilled-Diffusers"
IMAGE_URL = "https://raw.githubusercontent.com/SkyworkAI/Matrix-Game/main/Matrix-Game-3/demo_images/001/image.png"
PROMPT = "A colorful, animated cityscape with a gas station and various buildings."
OUTPUT_PATH = "video_samples_matrixgame3"


def main():
    """Generate one Matrix-Game 3.0 video from ``IMAGE_URL`` and ``PROMPT``."""
    # Loader settings: single GPU, no FSDP; only the text encoder is
    # offloaded to CPU.
    load_options = {
        "num_gpus": 1,
        "use_fsdp_inference": False,
        "dit_cpu_offload": False,
        "vae_cpu_offload": False,
        "text_encoder_cpu_offload": True,
        "pin_cpu_memory": True,
    }
    generator = VideoGenerator.from_pretrained(MODEL_PATH, **load_options)

    # Sampling and output settings for the single generation.
    request = {
        "prompt": PROMPT,
        "image_path": IMAGE_URL,
        "height": 720,
        "width": 1280,
        "num_frames": 57,
        "num_inference_steps": 3,
        "guidance_scale": 1.0,
        "seed": 42,
        "output_path": OUTPUT_PATH,
        "save_video": True,
    }
    generator.generate_video(**request)


if __name__ == "__main__":
    main()

basic_mps.py

from fastvideo import VideoGenerator, PipelineConfig
from fastvideo.api.sampling_param import SamplingParam

def main():
    """Run the Wan2.1 1.3B text-to-video example with MPS-oriented settings.

    Builds a pipeline config with fp16 text-encoder precision, creates the
    generator with FSDP inference disabled, shrinks the sampling parameters
    to 25 frames at 256x256, and generates two videos with the same
    generator instance.
    """
    model_id = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"

    pipeline_config = PipelineConfig.from_pretrained(model_id)
    pipeline_config.text_encoder_precisions = ["fp16"]

    generator = VideoGenerator.from_pretrained(
        model_id,
        pipeline_config=pipeline_config,
        use_fsdp_inference=False,  # Disable FSDP for MPS
        dit_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,
        disable_autocast=False,
        num_gpus=1,
    )

    # Reduce from the default 81 frames to 25, because the SDPA attention
    # backend has to be used on MPS.
    sampling_param = SamplingParam.from_pretrained(model_id)
    sampling_param.num_frames = 25
    sampling_param.height = 256
    sampling_param.width = 256

    raccoon_prompt = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
    )
    lion_prompt = (
        "A majestic lion strides across the golden savanna, its powerful frame "
        "glistening under the warm afternoon sun. The tall grass ripples gently in "
        "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
        "embodying the raw energy of the wild. Low angle, steady tracking shot, "
        "cinematic."
    )

    # Both prompts reuse the already-loaded generator and sampling parameters.
    for text in (raccoon_prompt, lion_prompt):
        generator.generate_video(text, sampling_param=sampling_param)

if __name__ == "__main__":
    main()

basic_ray.py

from fastvideo import VideoGenerator

# from fastvideo.api.sampling_param import SamplingParam

OUTPUT_PATH = "video_samples"
def main():
    """Generate two videos on two GPUs using the "ray" executor backend.

    The generator is created once and reused for both prompts; each call
    passes OUTPUT_PATH as the output location with save_video=True.
    """
    # FastVideo picks default arguments for the model; for a local path it
    # makes a best-effort attempt to identify them.
    loader_options = dict(
        num_gpus=2,
        use_fsdp_inference=True,
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        # Set to False if CPU RAM is low or you hit an obscure
        # "CUDA error: Invalid argument".
        pin_cpu_memory=True,
        distributed_executor_backend="ray",
    )
    generator = VideoGenerator.from_pretrained(
        "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
        **loader_options,
    )

    prompts = [
        (
            "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
            "wide with interest. The playful yet serene atmosphere is complemented by soft "
            "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
        ),
        (
            "A majestic lion strides across the golden savanna, its powerful frame "
            "glistening under the warm afternoon sun. The tall grass ripples gently in "
            "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
            "embodying the raw energy of the wild. Low angle, steady tracking shot, "
            "cinematic."
        ),
    ]

    # Same call for every prompt; the model is not reloaded in between.
    for text in prompts:
        generator.generate_video(text, output_path=OUTPUT_PATH, save_video=True)


if __name__ == "__main__":
    main()

basic_sd35_t2i.py

# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

import argparse
import os
import re
from typing import List


DEFAULT_PROMPTS = [
    "a photo of a cat",
    "a cinematic photo of a red panda wearing a tiny backpack, standing on a rainy neon-lit street at night, shallow depth of field, sharp focus, 35mm, bokeh",
]


def _safe_filename(text: str, max_len: int = 100) -> str:
    """Turn free-form prompt text into a filesystem-friendly filename base.

    The text is truncated to ``max_len`` characters and stripped, path
    separators become underscores, whitespace runs collapse to one space,
    every character outside ``A-Za-z0-9 .,_-`` becomes an underscore, and
    leading/trailing spaces and dots are removed.

    Returns:
        The sanitized name, or ``"prompt"`` when nothing is left.
    """
    cleaned = text[:max_len].strip()

    # os.altsep is None on platforms with a single separator.
    for separator in (os.sep, os.altsep):
        if separator:
            cleaned = cleaned.replace(separator, "_")

    cleaned = re.sub(r"\s+", " ", cleaned)
    cleaned = re.sub(r"[^A-Za-z0-9 .,_-]", "_", cleaned)
    cleaned = cleaned.strip(" .")

    if not cleaned:
        return "prompt"
    return cleaned


def _remove_existing_outputs(out_dir: str, filename_base: str) -> None:
    """Delete earlier outputs in ``out_dir`` that belong to ``filename_base``.

    A file is removed when its name is ``filename_base``, optionally followed
    by ``_<digits>``, with an ``.mp4`` or ``.png`` extension. Clearing these
    keeps naming deterministic, since existing files would otherwise cause
    suffixes like _1, _2 to be appended. Does nothing when ``out_dir`` is not
    a directory.
    """
    if os.path.isdir(out_dir):
        stale_name = re.compile(rf"^{re.escape(filename_base)}(_\d+)?\.(mp4|png)$")
        for entry in os.listdir(out_dir):
            if stale_name.match(entry) is None:
                continue
            try:
                os.remove(os.path.join(out_dir, entry))
            except FileNotFoundError:
                # Already gone (e.g. removed concurrently); nothing to do.
                pass


def parse_args() -> argparse.Namespace:
    p = argparse.ArgumentParser(description="Run SD3.5 Medium text-to-image with FastVideo VideoGenerator.")
    p.add_argument("--model-path", default="stabilityai/stable-diffusion-3.5-medium", help="Path to local diffusers-format SD3.5 weights directory.")
    p.add_argument(
        "--out-dir",
        "--outdir",
        default="outputs/sd35/samples",
        help="Output directory for generated mp4 files.",
    )
    p.add_argument(
        "--prompt",
        action="append",
        default=None,
        help="Prompt text. Repeat --prompt multiple times to generate multiple samples.",
    )
    p.add_argument("--negative", default="lowres, blurry, jpeg artifacts, watermark, text", help="Negative prompt.")
    p.add_argument(
        "--backend",
        default=None,
        help="Set FASTVIDEO_ATTENTION_BACKEND (e.g. TORCH_SDPA). If omitted, respects the existing env var.",
    )
    p.add_argument("--seed", type=int, default=42, help="Base seed. Each prompt uses seed + prompt_idx.")
    p.add_argument("--height", type=int, default=768, help="Output height.")
    p.add_argument("--width", type=int, default=768, help="Output width.")
    p.add_argument("--steps", type=int, default=28, help="Number of inference steps.")
    p.add_argument("--guidance", type=float, default=6.0, help="Guidance scale.")
    p.add_argument("--num-gpus", type=int, default=1, help="Number of GPUs to use.")
    return p.parse_args()


def main() -> None:
    """Generate one image per prompt with SD3.5 through VideoGenerator.

    Uses the ``--prompt`` values when given, otherwise DEFAULT_PROMPTS. Each
    prompt gets seed ``args.seed + index`` and a ``.png`` output path inside
    ``args.out_dir``; matching older outputs are removed first. The generator
    is always shut down, even when a generation call raises.
    """
    args = parse_args()
    prompts: List[str] = args.prompt or DEFAULT_PROMPTS

    # The backend override is exported before fastvideo is imported below.
    if args.backend:
        os.environ["FASTVIDEO_ATTENTION_BACKEND"] = args.backend

    from fastvideo import VideoGenerator

    os.makedirs(args.out_dir, exist_ok=True)

    generator = VideoGenerator.from_pretrained(
        model_path=args.model_path,
        num_gpus=args.num_gpus,
        workload_type="t2i",
        sp_size=1,
        tp_size=1,
        dit_cpu_offload=False,
        dit_layerwise_offload=False,
        text_encoder_cpu_offload=False,
        vae_cpu_offload=False,
        image_encoder_cpu_offload=False,
        pin_cpu_memory=False,
        use_fsdp_inference=False,
    )
    try:
        for i, prompt in enumerate(prompts):
            seed = args.seed + i

            filename_base = f"sd35_{i:02d}_seed{seed}_{_safe_filename(prompt, max_len=80)}"
            _remove_existing_outputs(args.out_dir, filename_base)

            output_path = os.path.join(args.out_dir, f"{filename_base}.png")
            print(f"[sd35] prompt_idx={i} seed={seed} output_path={output_path}")

            # A single frame at fps=1 is requested for each prompt.
            generator.generate_video(
                prompt,
                output_path=output_path,
                height=args.height,
                width=args.width,
                num_frames=1,
                fps=1,
                num_inference_steps=args.steps,
                guidance_scale=args.guidance,
                seed=seed,
                negative_prompt=args.negative,
                save_video=True,
            )

        print(f"[sd35] done. outputs written to: {args.out_dir}")
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_self_forcing_causal.py

import os
import time
from fastvideo import VideoGenerator, SamplingParam

OUTPUT_PATH = "video_samples_causal"
def main():
    """Generate one video with the self-forcing causal Wan2.1 1.3B checkpoint.

    Loads the generator and the model's sampling parameters from the same
    model name, then issues a single request that passes OUTPUT_PATH as the
    output location with save_video=True.
    """
    model_name = "wlsaidhi/SFWan2.1-T2V-1.3B-Diffusers"

    # FastVideo picks default arguments for the model; for a local path it
    # makes a best-effort attempt to identify them.
    loader_options = {
        "num_gpus": 1,
        "use_fsdp_inference": False,  # set to True if GPU is out of memory
        "text_encoder_cpu_offload": False,
        "dit_cpu_offload": False,
    }
    generator = VideoGenerator.from_pretrained(model_name, **loader_options)

    sampling_param = SamplingParam.from_pretrained(model_name)

    prompt = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
    )
    generator.generate_video(
        prompt,
        sampling_param=sampling_param,
        output_path=OUTPUT_PATH,
        save_video=True,
    )

if __name__ == "__main__":
    main()

basic_self_forcing_causal_wan2_2_i2v.py

# NOTE: This is still a work in progress, and the checkpoints are not released yet.

from fastvideo import VideoGenerator, SamplingParam
import json
# from fastvideo.api.sampling_param import SamplingParam

OUTPUT_PATH = "video_samples_self_forcing_causal_wan2_2_14B_i2v"
def main():
    """Generate one video per prompt/image pair with the SF Wan2.2 I2V preview model.

    Pairs are read from ``assets/prompts/mixkit_i2v.jsonl``. The file is
    accepted either as a single JSON document (a list of objects, or one
    object) or as JSON Lines (one object per non-blank line). Each object
    must provide ``prompt`` and ``image_path``.

    Raises:
        FileNotFoundError: if the prompt file does not exist.
        json.JSONDecodeError: if the file is neither valid JSON nor JSON Lines.
    """
    # Read the prompt file first so a missing or malformed file fails before
    # the model is loaded.
    with open("assets/prompts/mixkit_i2v.jsonl", "r", encoding="utf-8") as f:
        raw = f.read()
    try:
        prompt_image_pairs = json.loads(raw)
    except json.JSONDecodeError:
        # The extension says JSON Lines: parse each non-blank line on its own.
        prompt_image_pairs = [json.loads(line) for line in raw.splitlines() if line.strip()]
    if isinstance(prompt_image_pairs, dict):
        # A one-record file parses to a bare object; iterate it as one pair
        # rather than over its keys.
        prompt_image_pairs = [prompt_image_pairs]

    model_id = "FastVideo/SFWan2.2-I2V-A14B-Preview-Diffusers"

    # FastVideo picks default arguments for the model; for a local path it
    # makes a best-effort attempt to identify them.
    generator = VideoGenerator.from_pretrained(
        model_id,
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,  # DiT needs to be offloaded for MoE
        dit_precision="fp32",
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        dmd_denoising_steps=[1000, 850, 700, 550, 350, 275, 200, 125],
        # Set pin_cpu_memory to False if CPU RAM is limited and there are no
        # frequent CPU-GPU transfers.
        pin_cpu_memory=True,
    )

    sampling_param = SamplingParam.from_pretrained(model_id)
    sampling_param.num_frames = 81
    sampling_param.width = 832
    sampling_param.height = 480
    sampling_param.seed = 1000

    for prompt_image_pair in prompt_image_pairs:
        prompt = prompt_image_pair["prompt"]
        image_path = prompt_image_pair["image_path"]
        generator.generate_video(
            prompt,
            image_path=image_path,
            output_path=OUTPUT_PATH,
            save_video=True,
            sampling_param=sampling_param,
        )


if __name__ == "__main__":
    main()

basic_self_forcing_causal_wan2_2_t2v.py

# NOTE: This is still a work in progress, and the checkpoints are not released yet.

from fastvideo import VideoGenerator

# from fastvideo.api.sampling_param import SamplingParam

OUTPUT_PATH = "video_samples_self_forcing_causal_wan2_2_14B_t2v"
def main():
    """Generate one 81-frame video with the self-forcing Wan2.2 T2V A14B model.

    NOTE(review): both ``init_weights_from_safetensors*`` arguments are
    absolute paths on a specific shared filesystem; they must be changed to
    run this script anywhere else.
    """
    # FastVideo picks default arguments for the model; for a local path it
    # makes a best-effort attempt to identify them.
    loader_options = dict(
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,  # DiT needs to be offloaded for MoE
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        dmd_denoising_steps=[1000, 850, 700, 550, 350, 275, 200, 125],
        # Set pin_cpu_memory to False if CPU RAM is limited and there are no
        # frequent CPU-GPU transfers.
        pin_cpu_memory=True,
        init_weights_from_safetensors="/mnt/sharefs/users/hao.zhang/wei/SFwan2.2_distill_self_forcing_release_cfg2/checkpoint-246_weight_only/generator_inference_transformer/",
        init_weights_from_safetensors_2="/mnt/sharefs/users/hao.zhang/wei/SFwan2.2_distill_self_forcing_release_cfg2/checkpoint-246_weight_only/generator_2_inference_transformer/",
        num_frame_per_block=7,
    )
    generator = VideoGenerator.from_pretrained(
        "rand0nmr/SFWan2.2-T2V-A14B-Diffusers",
        **loader_options,
    )

    prompt = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
    )
    generator.generate_video(
        prompt,
        num_frames=81,
        output_path=OUTPUT_PATH,
        save_video=True,
    )


if __name__ == "__main__":
    main()

basic_stable_audio.py

# SPDX-License-Identifier: Apache-2.0
"""Stable Audio Open 1.0 — text-to-audio (baseline) example.

User story (game-audio designer, prototyping):
    "I'm prototyping a level and I need 6 seconds of background
    ambience — gentle wind, distant thunder, a hint of birdsong. I
    don't want to dig through a sound library; I want to type what I
    hear in my head and get a wav back. If it's wrong I'll iterate
    on the prompt. This is the first stop."

User story (musician sketching ideas):
    "I want to bounce a 30s lo-fi drum loop to use as a placeholder
    bed while I build the rest of the track. Type prompt, get audio,
    drop into the DAW. The actual production beat I'll record
    myself, but I need *something* to write the chords against."

User story (researcher exploring the model):
    "First time touching Stable Audio Open — what does it sound
    like at default settings? This is the smallest amount of code
    that goes from prompt to wav."

How it works:
    Pure text-to-audio (T2A). The pipeline runs:
        T5 + NumberConditioner -> StableAudioDiT -> Oobleck VAE
    via the `dpmpp-3m-sde` k-diffusion sampler. All components are
    FastVideo-native — no diffusers / transformers model imports at
    runtime (see REVIEW item 30). Mirrors upstream
    `stable_audio_tools.inference.generation.generate_diffusion_cond`
    bit-for-bit (~0.2% abs_mean drift on 25 steps).

Tunable knobs (the "creative dials"):
    audio_end_in_s
        1–6   — quick ideation (sub-10s wall clock at 100 steps)
        10–30 — full musical phrase / loop length (the README example
                uses 30s)
        47.5  — model maximum (full sample_size = 2097152 / 44100 Hz)
    num_inference_steps
        25  — fast preview, occasional artifacts
        100 — preset default (matches the HF model card)
        250 — diminishing returns past here
    guidance_scale
        3   — looser, more variation per seed
        7   — preset default; matches README
        12+ — sharper but can sound "fried"

Prerequisites:
  1. Accept the terms on https://huggingface.co/stabilityai/stable-audio-open-1.0
     and export your HF token in the shell:
         export HF_TOKEN=hf_...
  2. Install optional inference deps (one-time):
         uv pip install k_diffusion einops_exts alias_free_torch torchsde
"""
from fastvideo import VideoGenerator

PROMPT = "Lo-fi hip hop instrumental with vinyl crackle and gentle piano."


def main() -> None:
    """Generate a 6-second clip for PROMPT and request it at a ``.wav`` path.

    ``shutdown()`` is called in a ``finally`` block so it still runs when
    generation raises.
    """
    generator = VideoGenerator.from_pretrained(
        "FastVideo/stable-audio-open-1.0-Diffusers",
        num_gpus=1,
    )
    output_path = "outputs_audio/stable_audio_basic/output_stable_audio.wav"
    try:
        generator.generate_video(
            prompt=PROMPT,
            output_path=output_path,
            save_video=True,
            # 6-second clip; the model max is ~47.5s.
            audio_end_in_s=6.0,
            # The registered preset gives 100 steps + CFG=7.0 by default;
            # override num_inference_steps / guidance_scale here for QA.
        )
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_stable_audio_a2a.py

# SPDX-License-Identifier: Apache-2.0
"""Stable Audio Open 1.0 — audio-to-audio variation example.

User story (musician, late at night):
    "I generated this 12-second lo-fi loop earlier and I love the chord
    progression and overall vibe, but the snare hit at 0:08 sounds wrong
    and the rhythm feels stiff. I don't want to start over from scratch
    and lose what's working — I want the model to keep the harmony and
    mood but reroll the percussion + groove."

User story (sound designer, on a deadline):
    "I have one good 'sword clang' SFX. The art director wants 8 sibling
    variations that all feel like the same sword from different angles —
    same metal, same weight, slightly different impact. I'd rather
    refine my one good take than text-prompt my way through 50 misses."

Pass `init_audio=path/to/clip` (any wav/mp3/mp4/m4a/flac the standard
deps decode) and the model will use it as a starting point for the
text prompt instead of pure noise.

Picking `init_audio_strength` (0.0 to 1.0):

    Higher = closer to the source clip. Lower = more transformation.
    (Same convention as the "Input Audio Strength" slider in
    Stability's commercial Stable Audio web UI, so values transfer
    directly.)

      | strength | what you get                                       |
      |----------|----------------------------------------------------|
      |   1.00   | Output ≈ reference. No transformation.             |
      |   0.85   | Texture micro-variation only.                      |
      |   0.70   | Light reroll, same instruments.                    |
      |   0.60   | Default. Instrument identity is replaceable        |
      |          | (cello can take over from piano on the same notes).|
      |   0.50   | Heavy — only melody / chord progression survives.  |
      |   0.30   | Reference acts as a loose mood prompt.             |
      |   0.00   | Plain T2A — reference ignored.                     |

    Rule of thumb by intent:
      * "Fix one part of this clip"            -> 0.75 .. 0.85
      * "Same notes, different instrument"     -> 0.55 .. 0.65
      * "Same chord progression, new content"  -> 0.40 .. 0.55
      * "Use this as a loose mood prompt"      -> 0.20 .. 0.35

    If the reference timbre is bleeding through more than you want,
    lower it; if the structure is gone, raise it.

Prerequisites: same as `basic_stable_audio.py`.
"""
from fastvideo import VideoGenerator

PROMPT = "Change the piano to a cello playing the same notes"
# Path to any audio-bearing file (wav, mp3, mp4, m4a, flac, ...).
# Set to `None` to skip A2A and run plain T2A.
INIT_AUDIO_PATH: str | None = None
# Reference fidelity in [0, 1] -- higher = closer to source.
INIT_AUDIO_STRENGTH = 0.6


def main() -> None:
    """Run one audio-to-audio request using the module-level settings.

    Passes INIT_AUDIO_PATH and INIT_AUDIO_STRENGTH as ``init_audio`` and
    ``init_audio_strength`` together with PROMPT for a 6-second clip.
    ``shutdown()`` is called in a ``finally`` block so it still runs when
    generation raises.
    """
    generator = VideoGenerator.from_pretrained(
        "FastVideo/stable-audio-open-1.0-Diffusers",
        num_gpus=1,
    )
    try:
        generator.generate_video(
            prompt=PROMPT,
            output_path="outputs_audio/stable_audio_a2a/output_a2a.wav",
            save_video=True,
            audio_end_in_s=6.0,
            init_audio=INIT_AUDIO_PATH,
            init_audio_strength=INIT_AUDIO_STRENGTH,
        )
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_stable_audio_inpaint.py

# SPDX-License-Identifier: Apache-2.0
"""Stable Audio Open 1.0 — inpainting / outpainting (loop extension) example.

User story (loop extension — the killer app):
    "I have a 6-second drum loop my client likes. They want it as
    background bed for a 30-second ad. I need it to loop seamlessly,
    but a hard cut every 6s sounds bad. Let me extend it to 30s,
    keeping the first 6s exactly as-is and letting the model continue
    the groove for the remaining 24s."

User story (audio repair):
    "There's a microphone bump at 0:14 in this 30-second field
    recording — really obvious in headphones. Mask out 0:13 to 0:15
    and let the model regenerate plausible ambience that blends in.
    Everything else stays exactly as I recorded it."

User story (transition smoothing):
    "I have two 10-second clips I want to crossfade. Mask out a 1s
    overlap region in the middle and let the model invent a coherent
    transition between the two."

How it works (RePaint-style blending):
    Stable Audio Open 1.0 wasn't trained as an inpainting model
    (`model_type=diffusion_cond`, not `diffusion_cond_inpaint`), so we
    can't use the upstream's mask-conditioned approach directly. We
    use the RePaint trick instead, which works on any v-prediction
    diffusion model:

      1. Encode the reference clip into latent space.
      2. At every denoising step `i`, replace the kept region of the
         in-flight latent (where mask == 1) with the reference
         re-noised to the next timestep's sigma. Only the unkept
         region (mask == 0) is freely denoised.
      3. After the loop, the kept region is exactly the reference;
         the unkept region is freshly generated content.

    This is approximate compared to a properly trained inpainting
    checkpoint — the seam between kept/unkept can have slight EQ
    discontinuity — but it works on the existing public model.

Tunable: the mask is a 1-D tensor in {0, 1} at the model's sample
rate. Conventions:
    1.0 = keep this sample from the reference
    0.0 = regenerate this sample

Prerequisites: same as `basic_stable_audio.py`.
"""
import os

from fastvideo import VideoGenerator

PROMPT = "Steady lo-fi hip hop drum loop with vinyl crackle."
# Required: path to the reference audio file (wav, mp3, mp4, m4a, flac,
# ...) you want to extend or repair. The pipeline raises if a mask is
# passed without a reference, so this must be a real path.
REFERENCE_AUDIO_PATH = "path/to/your/loop.wav"
KEEP_SECONDS = 6.0       # first KEEP_SECONDS preserved exactly
TOTAL_SECONDS = 12.0     # extend the loop to this duration


def main() -> None:
    """Extend or repair REFERENCE_AUDIO_PATH with a masked generation request.

    Checks that the reference file exists before loading the model, then
    requests TOTAL_SECONDS of audio with the reference passed as
    ``inpaint_audio`` and ``(KEEP_SECONDS, TOTAL_SECONDS)`` as
    ``inpaint_mask``. ``shutdown()`` is called in a ``finally`` block so it
    still runs when generation raises.

    Raises:
        FileNotFoundError: if REFERENCE_AUDIO_PATH is not an existing file.
    """
    if not os.path.isfile(REFERENCE_AUDIO_PATH):
        raise FileNotFoundError(
            f"REFERENCE_AUDIO_PATH={REFERENCE_AUDIO_PATH!r} does not exist. "
            "Edit this script to point at a real audio file (wav/mp3/mp4/"
            "m4a/flac) before running.")
    generator = VideoGenerator.from_pretrained(
        "FastVideo/stable-audio-open-1.0-Diffusers",
        num_gpus=1,
    )
    try:
        generator.generate_video(
            prompt=PROMPT,
            output_path="outputs_audio/stable_audio_inpaint/output_inpaint.wav",
            save_video=True,
            audio_end_in_s=TOTAL_SECONDS,
            inpaint_audio=REFERENCE_AUDIO_PATH,
            # Tuple form: keep first KEEP_SECONDS, regenerate the rest.
            inpaint_mask=(KEEP_SECONDS, TOTAL_SECONDS),
        )
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_stable_audio_small.py

# SPDX-License-Identifier: Apache-2.0
"""Stable Audio Open Small — fast / lightweight T2A example.

User story (interactive UI builder):
    "I'm building a sound-design UI where the user types a prompt and
    we want sub-2-second feedback so the experience feels like
    autocomplete, not a render queue. The full Stable Audio Open 1.0
    takes ~8s on a single GPU; the small variant takes a fraction of
    that — quality is lower but completely usable for real-time
    iteration."

User story (overnight batch jobs):
    "I'm generating 10,000 short SFX variants for a procedural game.
    Wall-clock matters more than per-clip polish — give me the small
    model so I can fit the run in one night instead of a week."

How it works:
    The small variant is a separate Stability AI checkpoint
    (`stabilityai/stable-audio-open-small`) that ships the same Oobleck
    VAE as the 1.0 base model but a smaller / faster DiT (`embed_dim=1024`,
    `depth=16`, `qk_norm="ln"`) and only one duration conditioner
    (`seconds_total`, no `seconds_start`). FastVideo loads from the
    converted Diffusers-format repo `FastVideo/stable-audio-open-small-Diffusers`
    via the standard component loader; per-variant arch fields come
    from `transformer/config.json` and `conditioner/config.json`.

Prerequisites: same as `basic_stable_audio.py`. The converted repo is
public so no gated-access flow is required.
"""
from fastvideo import VideoGenerator

PROMPT = "Lo-fi hip hop instrumental with vinyl crackle and gentle piano."


def main() -> None:
    """Generate a 6-second clip for PROMPT with the small Stable Audio variant.

    ``shutdown()`` is called in a ``finally`` block so it still runs when
    generation raises.
    """
    generator = VideoGenerator.from_pretrained(
        "FastVideo/stable-audio-open-small-Diffusers",
        num_gpus=1,
    )
    output_path = "outputs_audio/stable_audio_small/output_stable_audio_small.wav"
    try:
        generator.generate_video(
            prompt=PROMPT,
            output_path=output_path,
            save_video=True,
            # Small variant trains on a ~11.9s window — keep `audio_end_in_s`
            # at or below that.
            audio_end_in_s=6.0,
        )
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()

basic_turbodiffusion.py

import os

# Set SLA attention backend BEFORE fastvideo imports
os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "SLA_ATTN"

from fastvideo import VideoGenerator

OUTPUT_PATH = "video_samples_turbodiffusion"


def main() -> None:
    """Generate two videos with the TurboDiffusion Wan2.1 1.3B checkpoint.

    TurboDiffusion is 1-4 step video generation using the RCM scheduler and
    SLA attention. One generator is created on a single GPU and reused for
    both prompts, each with seed 42 and OUTPUT_PATH as the output location.
    """
    generator = VideoGenerator.from_pretrained(
        "loayrashid/TurboWan2.1-T2V-1.3B-Diffusers",
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        # Uncomment and set to False if using an RTX 4090:
        # pin_cpu_memory=False,
    )

    # TurboDiffusion defaults (from config): guidance_scale=1.0 and
    # num_inference_steps=4.
    prompts = (
        (
            "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
            "wide with interest. The playful yet serene atmosphere is complemented by soft "
            "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
        ),
        (
            "A majestic lion strides across the golden savanna, its powerful frame "
            "glistening under the warm afternoon sun. The tall grass ripples gently in "
            "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
            "embodying the raw energy of the wild. Low angle, steady tracking shot, "
            "cinematic."
        ),
    )

    # The second prompt runs without reloading the model.
    for text in prompts:
        generator.generate_video(
            text,
            output_path=OUTPUT_PATH,
            save_video=True,
            seed=42,
        )


if __name__ == "__main__":
    main()

basic_turbodiffusion_14b.py

import os

# Set SLA attention backend BEFORE fastvideo imports
os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "SLA_ATTN"

from fastvideo import VideoGenerator

OUTPUT_PATH = "video_samples_turbodiffusion_14B"


def main() -> None:
    """Generate two videos with the TurboDiffusion Wan2.1 14B checkpoint.

    TurboDiffusion 14B is 1-4 step video generation using the RCM scheduler
    and SLA attention. One generator is created on two GPUs and reused for
    both prompts, each with seed 42 and OUTPUT_PATH as the output location.
    """
    generator = VideoGenerator.from_pretrained(
        "loayrashid/TurboWan2.1-T2V-14B-Diffusers",
        num_gpus=2,  # 14B model needs more GPUs
    )

    prompts = (
        (
            "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
            "wide with interest. The playful yet serene atmosphere is complemented by soft "
            "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
        ),
        (
            "A majestic lion strides across the golden savanna, its powerful frame "
            "glistening under the warm afternoon sun. The tall grass ripples gently in "
            "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
            "embodying the raw energy of the wild. Low angle, steady tracking shot, "
            "cinematic."
        ),
    )

    # The second prompt runs without reloading the model.
    for text in prompts:
        generator.generate_video(
            text,
            output_path=OUTPUT_PATH,
            save_video=True,
            seed=42,
        )


if __name__ == "__main__":
    main()

basic_turbodiffusion_i2v.py

import os

# Set SLA attention backend BEFORE fastvideo imports
os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "SLA_ATTN"

from fastvideo import VideoGenerator

# Hugging Face model ID (a local model path can be used here instead)
MODEL_PATH = "loayrashid/TurboWan2.2-I2V-A14B-Diffusers"
OUTPUT_PATH = "video_samples_turbodiffusion_i2v"


def main() -> None:
    """Generate one image-to-video sample with the TurboDiffusion I2V model.

    TurboDiffusion I2V is 1-4 step image-to-video generation. The generator
    is created from MODEL_PATH on two GPUs and called once with the example
    prompt and image URL, seed 42, and OUTPUT_PATH as the output location.
    """
    generator = VideoGenerator.from_pretrained(MODEL_PATH, num_gpus=2)

    # Example conditioning image and the prompt describing it.
    image_path = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/wan_i2v_input.JPG"
    prompt = "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside."

    request_options = {
        "image_path": image_path,
        "output_path": OUTPUT_PATH,
        "save_video": True,
        "seed": 42,
    }
    generator.generate_video(prompt, **request_options)


if __name__ == "__main__":
    main()

basic_wan2_2.py

from fastvideo import VideoGenerator

# from fastvideo.api.sampling_param import SamplingParam

OUTPUT_PATH = "video_samples_wan2_2_14B_t2v"


def main():
    """Generate two text-to-video samples with Wan2.2 T2V A14B.

    The model is loaded once with ``num_gpus=2`` and reused for both prompts.
    Each call requests 81 frames at height 720 / width 1280, asks for the
    video to be saved, and passes ``OUTPUT_PATH`` as the output path.
    """
    # No sampling defaults are configured here: FastVideo will automatically
    # use the optimal default arguments for the model. If a local path is
    # provided, it makes a best-effort attempt to identify them.
    load_options = dict(
        # FastVideo will automatically handle distributed setup
        num_gpus=2,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,  # DiT need to be offloaded for MoE
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        # Set pin_cpu_memory to false if CPU RAM is limited and there're no
        # frequent CPU-GPU transfer
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )
    generator = VideoGenerator.from_pretrained("Wan-AI/Wan2.2-T2V-A14B-Diffusers", **load_options)

    # Alternative that is left disabled: build the sampling parameters
    # explicitly and hand them over via ``sampling_param=``.
    # sampling_param = SamplingParam.from_pretrained("Wan-AI/Wan2.1-T2V-1.3B-Diffusers")
    # sampling_param.num_frames = 45
    # sampling_param.image_path = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
    # video = generator.generate_video(prompt, sampling_param=sampling_param, output_path="wan_t2v_videos/")

    raccoon_prompt = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
    )
    lion_prompt = (
        "A majestic lion strides across the golden savanna, its powerful frame "
        "glistening under the warm afternoon sun. The tall grass ripples gently in "
        "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
        "embodying the raw energy of the wild. Low angle, steady tracking shot, "
        "cinematic."
    )

    # Same simple API regardless of GPU count; the second prompt runs without
    # reloading the model.
    render_options = dict(
        output_path=OUTPUT_PATH,
        save_video=True,
        height=720,
        width=1280,
        num_frames=81,
    )
    for text in (raccoon_prompt, lion_prompt):
        _ = generator.generate_video(text, **render_options)


if __name__ == "__main__":
    main()

basic_wan2_2_Fun.py

from fastvideo import VideoGenerator

# from fastvideo.api.sampling_param import SamplingParam

OUTPUT_PATH = "video_samples_wan2_1_Fun"
OUTPUT_NAME = "wan2.1_test"


def main():
    """Generate one control-conditioned sample with a Wan "Fun" Control model.

    Loads the checkpoint with ``num_gpus=1`` and issues a single request that
    combines a text prompt, a negative prompt, a reference image
    (``image_path``) and a control video (``video_path``). The result is
    requested to be saved using ``OUTPUT_PATH`` / ``OUTPUT_NAME``.
    """
    # Conditioning assets: a reference image and a pose control video.
    reference_image = "https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/wan_fun/asset_Wan2_2/v1.0/8.png"
    pose_video = "https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/wan_fun/asset_Wan2_2/v1.0/pose.mp4"

    positive_text = "一位年轻女性穿着一件粉色的连衣裙，裙子上有白色的装饰和粉色的纽扣。她的头发是紫色的，头上戴着一个红色的大蝴蝶结，显得非常可爱和精致。她还戴着一个红色的领结，整体造型充满了少女感和活力。她的表情温柔，双手轻轻交叉放在身前，姿态优雅。背景是简单的灰色，没有任何多余的装饰，使得人物更加突出。她的妆容清淡自然，突显了她的清新气质。整体画面给人一种甜美、梦幻的感觉，仿佛置身于童话世界中。"
    negative_text = "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走"
    # Alternative English prompt pair, left disabled:
    # prompt                  = "A young woman with beautiful, clear eyes and blonde hair stands in the forest, wearing a white dress and a crown. Her expression is serene, reminiscent of a movie star, with fair and youthful skin. Her brown long hair flows in the wind. The video quality is very high, with a clear view. High quality, masterpiece, best quality, high resolution, ultra-fine, fantastical."
    # negative_prompt         = "Twisted body, limb deformities, text captions, comic, static, ugly, error, messy code."

    # FastVideo will automatically use the optimal default arguments for the
    # model; for a local path it makes a best-effort attempt to identify them.
    load_options = dict(
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        # Originally annotated "DiT need to be offloaded for MoE".
        # NOTE(review): confirm that rationale applies to this 1.3B checkpoint.
        dit_cpu_offload=True,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        # Set pin_cpu_memory to false if CPU RAM is limited and there're no
        # frequent CPU-GPU transfer
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )
    generator = VideoGenerator.from_pretrained(
        "IRMChen/Wan2.1-Fun-1.3B-Control-Diffusers",
        # "alibaba-pai/Wan2.2-Fun-A14B-Control",
        **load_options,
    )

    # The returned value is not used by this example.
    _ = generator.generate_video(
        positive_text,
        negative_prompt=negative_text,
        image_path=reference_image,
        video_path=pose_video,
        output_path=OUTPUT_PATH,
        output_video_name=OUTPUT_NAME,
        save_video=True,
    )

if __name__ == "__main__":
    main()

basic_wan2_2_i2v.py

from fastvideo import VideoGenerator

# from fastvideo.api.sampling_param import SamplingParam

OUTPUT_PATH = "video_samples_wan2_2_14B_i2v"


def main():
    """Generate one image-to-video sample with Wan2.2 I2V A14B.

    Loads the model with ``num_gpus=1`` and generates a single 81-frame clip
    conditioned on an example image and a text prompt. The video is requested
    to be saved, with ``OUTPUT_PATH`` passed as the output path.
    """
    # Conditioning inputs: example image URL and the text describing it.
    cat_image = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/wan_i2v_input.JPG"
    cat_prompt = "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside."

    # FastVideo will automatically use the optimal default arguments for the
    # model; for a local path it makes a best-effort attempt to identify them.
    load_options = dict(
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,  # DiT need to be offloaded for MoE
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        # Set pin_cpu_memory to false if CPU RAM is limited and there're no
        # frequent CPU-GPU transfer
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )
    generator = VideoGenerator.from_pretrained("Wan-AI/Wan2.2-I2V-A14B-Diffusers", **load_options)

    # NOTE(review): height=832 with width=480 requests a portrait frame;
    # confirm this matches the orientation of the input image.
    _ = generator.generate_video(
        cat_prompt,
        image_path=cat_image,
        output_path=OUTPUT_PATH,
        save_video=True,
        height=832,
        width=480,
        num_frames=81,
    )

if __name__ == "__main__":
    main()

basic_wan2_2_ti2v.py

from fastvideo import VideoGenerator

OUTPUT_PATH = "video_samples_wan2_2_5B_ti2v"


def main():
    """Generate an I2V and then a T2V sample with Wan2.2 TI2V 5B.

    The model is loaded once with ``num_gpus=1``. The first request passes an
    ``image_path`` (image-to-video); the second passes only a prompt
    (text-to-video) and reuses the already-loaded model. Both requests ask for
    the video to be saved with ``OUTPUT_PATH`` as the output path.
    """
    # FastVideo will automatically use the optimal default arguments for the
    # model; for a local path it makes a best-effort attempt to identify them.
    model_name = "Wan-AI/Wan2.2-TI2V-5B-Diffusers"
    load_options = dict(
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=False,  # set to True if GPU is out of memory
        dit_cpu_offload=True,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=True,
        # set to false if low CPU RAM or hit obscure "CUDA error: Invalid argument"
        pin_cpu_memory=True,
        # image_encoder_cpu_offload=False,
    )
    generator = VideoGenerator.from_pretrained(model_name, **load_options)

    # Options common to both requests.
    save_options = {"output_path": OUTPUT_PATH, "save_video": True}

    # I2V is triggered just by passing in an image_path argument.
    cat_prompt = "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside."
    cat_image = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/wan_i2v_input.JPG"
    _ = generator.generate_video(cat_prompt, **save_options, image_path=cat_image)

    # T2V mode: a different prompt, generated without reloading the model.
    lion_prompt = (
        "A majestic lion strides across the golden savanna, its powerful frame "
        "glistening under the warm afternoon sun. The tall grass ripples gently in "
        "the breeze, enhancing the lion's commanding presence. The tone is vibrant, "
        "embodying the raw energy of the wild. Low angle, steady tracking shot, "
        "cinematic."
    )
    _ = generator.generate_video(lion_prompt, **save_options)


if __name__ == "__main__":
    main()

basic_zimage.py

# SPDX-License-Identifier: Apache-2.0
"""Run Z-Image-Turbo text-to-image generation through FastVideo.

User story:
    "I want the official Z-Image-Turbo defaults and a deterministic PNG from
    a local or Hugging Face checkpoint."
"""

import argparse
from pathlib import Path

from fastvideo import VideoGenerator
from fastvideo.api import (
    EngineConfig,
    GenerationRequest,
    GeneratorConfig,
    OutputConfig,
    ParallelismConfig,
    PipelineSelection,
    SamplingConfig,
)


DEFAULT_PROMPT = (
    "Young Chinese woman in red Hanfu, intricate embroidery. Impeccable makeup, red floral forehead pattern. "
    "Elaborate high bun, golden phoenix headdress, red flowers, beads. Holds round folding fan with lady, trees, bird. "
    "Neon lightning-bolt lamp (⚡️), bright yellow glow, above extended left palm. Soft-lit outdoor night background, "
    "silhouetted tiered pagoda (西安大雁塔), blurred colorful distant lights."
)
DEFAULT_REVISION = "f332072aa78be7aecdf3ee76d5c247082da564a6"


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Run Z-Image-Turbo text-to-image generation.")
    parser.add_argument("--model-path", default="Tongyi-MAI/Z-Image-Turbo")
    parser.add_argument("--revision", default=DEFAULT_REVISION)
    parser.add_argument("--output", default="outputs/zimage/zimage_turbo.png")
    parser.add_argument("--prompt", default=DEFAULT_PROMPT)
    parser.add_argument("--negative-prompt", default="")
    parser.add_argument("--height", type=int, default=1024)
    parser.add_argument("--width", type=int, default=1024)
    parser.add_argument("--steps", type=int, default=8)
    parser.add_argument("--guidance-scale", type=float, default=0.0)
    parser.add_argument("--max-sequence-length", type=int, default=512)
    parser.add_argument("--cfg-normalization", action=argparse.BooleanOptionalAction, default=False)
    parser.add_argument("--cfg-truncation", type=float, default=1.0)
    parser.add_argument("--seed", type=int, default=42)
    return parser.parse_args()


def main() -> None:
    """Generate a single image with Z-Image-Turbo from the CLI options.

    Creates the parent directory of ``--output``, builds a single-GPU
    generator for the ``t2i`` workload, and submits one one-frame request
    that asks the engine to save its result to ``--output``. The generator is
    shut down even if request construction or generation raises.
    """
    cli = parse_args()
    image_file = Path(cli.output)
    # Ensure the destination directory exists before the engine is created.
    image_file.parent.mkdir(parents=True, exist_ok=True)

    # One GPU, tensor/sequence parallel sizes of 1, FSDP inference disabled.
    parallelism = ParallelismConfig(tp_size=1, sp_size=1)
    engine = EngineConfig(
        num_gpus=1,
        parallelism=parallelism,
        use_fsdp_inference=False,
    )
    # The model registry selects the native zimage_turbo preset.
    pipeline = PipelineSelection(workload_type="t2i")
    generator_config = GeneratorConfig(
        model_path=cli.model_path,
        revision=cli.revision,
        engine=engine,
        pipeline=pipeline,
    )
    generator = VideoGenerator.from_config(generator_config)

    try:
        # A single frame at fps=1 is requested for the t2i workload.
        sampling = SamplingConfig(
            height=cli.height,
            width=cli.width,
            num_frames=1,
            fps=1,
            num_inference_steps=cli.steps,
            guidance_scale=cli.guidance_scale,
            max_sequence_length=cli.max_sequence_length,
            cfg_normalization=cli.cfg_normalization,
            cfg_truncation=cli.cfg_truncation,
            seed=cli.seed,
        )
        # Let the engine write the file; frames are not returned to us.
        destination = OutputConfig(
            output_path=str(image_file),
            save_video=True,
            return_frames=False,
        )
        request = GenerationRequest(
            prompt=cli.prompt,
            negative_prompt=cli.negative_prompt,
            sampling=sampling,
            output=destination,
        )
        generator.generate(request)
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()

edit_glm_image.py

# SPDX-License-Identifier: Apache-2.0
"""Run GLM-Image image-to-image (edit) generation through FastVideo.

User story:
    "I have the HF `zai-org/GLM-Image` checkpoint and a condition image, and
    want a minimal edit command (text + image -> edited image), saved as a PNG."

GLM-Image is a single unified pipeline: passing a condition image switches it
from text-to-image to the edit path (the condition enters the DiT via a KV-cache
write pass), so the generator config is identical to `basic_glm_image.py` — the
`inputs.pil_image` on the request is what selects the edit mode.
"""
import argparse
from pathlib import Path

from PIL import Image

from fastvideo import VideoGenerator
from fastvideo.api import (
    EngineConfig,
    GenerationRequest,
    GeneratorConfig,
    InputConfig,
    OutputConfig,
    ParallelismConfig,
    PipelineSelection,
    SamplingConfig,
)


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Run GLM-Image image-to-image (edit) generation.")
    parser.add_argument(
        "--model-path",
        default="zai-org/GLM-Image",
        help="HF id or local diffusers-format GLM-Image weights directory.",
    )
    parser.add_argument(
        "--image",
        default="assets/images/couple.jpg",
        help="Condition image to edit.",
    )
    parser.add_argument(
        "--output",
        default="image_output/edited.png",
        help="Output PNG path.",
    )
    parser.add_argument(
        "--prompt",
        default="Change the background to a snowy mountain landscape at golden hour.",
        help="Edit instruction.",
    )
    parser.add_argument("--height", type=int, default=1024)
    parser.add_argument("--width", type=int, default=1024)
    parser.add_argument("--steps", type=int, default=50)
    parser.add_argument("--guidance-scale", type=float, default=1.5)
    parser.add_argument("--seed", type=int, default=1024)
    parser.add_argument("--num-gpus", type=int, default=1)
    parser.add_argument("--tp-size", type=int, default=None)
    parser.add_argument("--sp-size", type=int, default=None)
    return parser.parse_args()


def main() -> None:
    """Edit an image with GLM-Image and save the first returned frame as a PNG.

    Reads the CLI options, loads the condition image as RGB, builds a
    generator for the ``t2i`` workload and submits one single-frame request
    carrying the condition image. The first returned frame is written to
    ``--output``. The generator is always shut down, including on failure.

    Raises:
        RuntimeError: If the generation result contains no frames, so that a
            run that produced no image does not exit silently.
    """
    args = parse_args()

    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)

    # Close the source file deterministically; convert() returns a separate
    # image object that stays usable after the file is closed.
    with Image.open(args.image) as source_image:
        condition = source_image.convert("RGB")

    # Parallel sizes when not given explicitly: with more than one GPU use
    # tp_size=num_gpus and sp_size=1; otherwise tp_size=1 and sp_size=num_gpus.
    multi_gpu = args.num_gpus > 1
    tp_size = args.tp_size if args.tp_size is not None else (args.num_gpus if multi_gpu else 1)
    sp_size = args.sp_size if args.sp_size is not None else (1 if multi_gpu else args.num_gpus)

    # GLM-Image needs trust_remote_code for its AR encoder; offload and the
    # pipeline class come from the model's registered defaults — don't override.
    # The pipeline is registered as t2i; passing inputs.pil_image below switches
    # it to the edit path.
    generator_config = GeneratorConfig(
        model_path=args.model_path,
        trust_remote_code=True,
        engine=EngineConfig(
            num_gpus=args.num_gpus,
            parallelism=ParallelismConfig(tp_size=tp_size, sp_size=sp_size),
        ),
        pipeline=PipelineSelection(workload_type="t2i"),
    )

    generator = VideoGenerator.from_config(generator_config)
    try:
        request = GenerationRequest(
            prompt=args.prompt,
            inputs=InputConfig(pil_image=condition),
            sampling=SamplingConfig(
                height=args.height,
                width=args.width,
                num_frames=1,
                fps=1,
                num_inference_steps=args.steps,
                guidance_scale=args.guidance_scale,
                seed=args.seed,
            ),
            # The engine is asked not to save; frames are returned and the
            # PNG is written below.
            output=OutputConfig(
                output_path=str(output.parent),
                save_video=False,
                return_frames=True,
            ),
        )
        result = generator.generate(request)
        # generate() may hand back a list of results; use the first one.
        if isinstance(result, list):
            result = result[0]

        frames = result.frames
        if frames is None or len(frames) == 0:
            # Previously this case fell through silently with exit status 0
            # and no file on disk.
            raise RuntimeError(f"GLM-Image returned no frames; nothing was written to {output}")

        Image.fromarray(frames[0]).save(output)
        print(f"Saved image to {output}")
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()

lingbotworld_examples/00/intrinsics.npy

lingbotworld_examples/00/poses.npy

lingbotworld_examples/01/intrinsics.npy

lingbotworld_examples/01/poses.npy

lingbotworld_examples/02/intrinsics.npy

lingbotworld_examples/02/poses.npy