Source: examples/inference/optimizations

Optimization Examples¶

python examples/inference/optimizations/attention_example.py

Additional Files¶

FastWan_QAD_TAEHV.py

"""Fast NVFP4 linear inference for Wan2.1-T2V-1.3B with TAEHV decoding.

This is the FP4-linear fast path from ``fp4_linear_wan2_1_1_3b.py`` with the
heavy Wan VAE swapped out for TAEHV -- a tiny autoencoder that decodes Wan2.1
latents directly (no denormalization) and is dramatically faster / lighter.

How it works: the generator runs with ``output_type="latent"`` so the pipeline
returns raw denoised latents instead of pixels (the Wan VAE is offloaded and
never used). We then decode those latents with TAEHV in this script and save
the frames ourselves. This mirrors the FastVideo-Quantization
``quantization_example_taehv.py`` proof-of-concept, but kept clean: TAEHV is a
pip package (no ``sys.path`` hacks), the latent->uint8 conversion is vectorized,
and there is no dead profiler / sanitization code.

Requirements:
    - Blackwell GPU (B200/B300, sm100a/sm103a) for the FP4 linear path
    - flashinfer (``pip install flashinfer-python``)
    - TAEHV weights ``taew2_1.pth`` (https://github.com/madebyollin/taehv)

Usage:
    python FastWan_QAD_TAEHV.py                    # FP4 + TAEHV + compile
    python FastWan_QAD_TAEHV.py --no-taehv         # FP4 + full Wan VAE
    python FastWan_QAD_TAEHV.py --no-compile       # eager
    python FastWan_QAD_TAEHV.py --baseline         # dense bf16 reference
    python FastWan_QAD_TAEHV.py --distilled_model ''  # base Wan2.1 weights
    python FastWan_QAD_TAEHV.py --warmups 5 --benchmark-runs 20  # timing stats (default)
"""

import argparse
import contextlib
import logging
import os
import statistics
import time

import imageio
import torch

from fastvideo import VideoGenerator
from fastvideo.configs.pipelines.base import PipelineConfig
from fastvideo.layers.quantization.nvfp4_qat_config import NVFP4QATConfig

OUTPUT_PATH = "video_samples"

# Distilled, quantization-aware (QAD) transformer for Wan2.1-1.3B (3 steps,
# guidance 1.0). Loaded on top of the base Wan2.1 pipeline; pass
# ``--distilled_model ''`` to run the base weights instead.
DEFAULT_DISTILLED_MODEL = "FastVideo/FastWan-QAD-1.3B"
DISTILLED_WEIGHTS_FILE = (
    "generator_inference_transformer/diffusion_pytorch_model.safetensors"
)

# TAEHV checkpoint for Wan2.1. Clone https://github.com/madebyollin/taehv to get
# ``taew2_1.pth`` (Wan 2.1 / Wan 2.2-14B / Qwen-Image all use this VAE).
DEFAULT_TAEHV_CHECKPOINT = "/root/taehv/taew2_1.pth"

PROMPT = (
    "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
    "wide with interest. The playful yet serene atmosphere is complemented by soft "
    "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
)


class TaehvDecoder:
    """Decode Wan2.1 latents to RGB frames with the TAEHV tiny autoencoder.

    TAEHV works directly on the *normalized* latents produced by the
    diffusion model (the representation FastVideo carries internally), so the
    denormalization step required by the full Wan VAE is skipped here.
    """

    def __init__(self, checkpoint_path: str, device: str = "cuda",
                 dtype: torch.dtype = torch.float16) -> None:
        """Load the TAEHV weights and put the model in eval mode.

        Args:
            checkpoint_path: Path to the TAEHV checkpoint file.
            device: Device the decoder is moved to.
            dtype: Dtype the decoder weights are cast to.
        """
        # Function-scope import: taehv is only required once a decoder is
        # actually constructed.
        from taehv import TAEHV

        self.device = device
        self.dtype = dtype
        print(f"Loading TAEHV from {checkpoint_path} ...")
        tiny_vae = TAEHV(checkpoint_path=checkpoint_path)
        tiny_vae = tiny_vae.to(device, dtype)
        self.model = tiny_vae.eval()

    def decode(self, latents: torch.Tensor):
        """Turn a latent batch into uint8 RGB frames for the first sample.

        Args:
            latents: ``[B, C, T, H, W]`` (NCTHW) normalized latent tensor.

        Returns:
            A ``(T, H, W, 3)`` uint8 numpy array ready for ``imageio.mimsave``.
            Only batch element 0 is decoded into the result.
        """
        with torch.no_grad():
            # TAEHV expects NTCHW, so swap the channel and time axes and move
            # the tensor onto the decoder's device/dtype.
            ntchw = latents.permute(0, 2, 1, 3, 4)
            ntchw = ntchw.to(self.device, self.dtype)
            video = self.model.decode_video(
                ntchw, parallel=True, show_progress_bar=False)
            # video is [B, T, 3, H, W] in [0, 1]; keep sample 0 and quantize
            # all frames at once.
            first_sample = video[0]
            as_uint8 = first_sample.clamp(0, 1).mul(255).to(torch.uint8)
            # TCHW -> THWC for the video writer.
            thwc = as_uint8.permute(0, 2, 3, 1)
            return thwc.cpu().numpy()


def resolve_distilled_weights(hf_id: str) -> str:
    """Map a distilled-model reference to a local weights path.

    Args:
        hf_id: Either an existing filesystem path or a HuggingFace repo ID.

    Returns:
        ``hf_id`` unchanged when it exists on disk; otherwise the local path
        returned by ``hf_hub_download`` for ``DISTILLED_WEIGHTS_FILE`` in
        that repo.
    """
    if not os.path.exists(hf_id):
        # Not a local path: treat it as a Hub repo and fetch the transformer
        # safetensors file.
        from huggingface_hub import hf_hub_download
        return hf_hub_download(repo_id=hf_id, filename=DISTILLED_WEIGHTS_FILE)
    return hf_id


@contextlib.contextmanager
def silence_request_log():
    """Temporarily mute the ``fastvideo.entrypoints.video_generator`` logger.

    Each ``generate(...)`` call logs a multi-line per-request block (height/
    width/prompt/steps/...) at INFO through that logger, and there is no
    built-in switch to turn it off. Inside the ``with`` body the logger level
    is raised to WARNING; on exit -- including when the body raises -- the
    level that was set before entry is put back.
    """
    request_logger = logging.getLogger("fastvideo.entrypoints.video_generator")
    saved_level = request_logger.level
    request_logger.setLevel(logging.WARNING)
    try:
        yield
    finally:
        # Always restore, even if the wrapped block failed.
        request_logger.setLevel(saved_level)


def resolve_taehv_checkpoint(path: str) -> str:
    """Return ``path`` if it exists, otherwise fail with download instructions.

    Args:
        path: Filesystem location of the TAEHV checkpoint.

    Returns:
        The same ``path`` that was passed in.

    Raises:
        FileNotFoundError: if nothing exists at ``path``; the message explains
            how to obtain the weights.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(
            f"TAEHV checkpoint not found at {path!r}. Clone the weights with:\n"
            "    git clone https://github.com/madebyollin/taehv\n"
            "and pass --taehv_checkpoint <repo>/taew2_1.pth")
    return path


def build_generator(args: argparse.Namespace) -> VideoGenerator:
    """Create the ``VideoGenerator`` configured by the parsed CLI flags.

    Args:
        args: Parsed arguments; reads ``model``, ``baseline``, ``no_compile``,
            ``distilled_model``, ``taehv`` and ``num_gpus``.

    Returns:
        The generator returned by ``VideoGenerator.from_pretrained``.
    """
    use_compile = not args.no_compile

    # bf16 for DiT, VAE and text encoder; unless --baseline is given the DiT
    # config additionally receives an NVFP4 quantization config.
    cfg = PipelineConfig.from_pretrained(args.model)
    cfg.dit_precision = "bf16"
    cfg.vae_precision = "bf16"
    cfg.text_encoder_precisions = ("bf16",)
    if not args.baseline:
        cfg.dit_config.quant_config = NVFP4QATConfig()

    # Optional keyword arguments, only passed when the matching flag is set.
    overrides = {}
    if args.distilled_model:
        weights_path = resolve_distilled_weights(args.distilled_model)
        print(f"Using distilled weights: {args.distilled_model} -> {weights_path}")
        overrides["init_weights_from_safetensors"] = weights_path
    if args.taehv:
        # Ask the pipeline for raw latents; decoding is done by TAEHV in this
        # script, so the in-pipeline VAE decode is not needed.
        overrides["output_type"] = "latent"

    # Everything stays on the GPU with no offloading. The one exception is
    # the Wan VAE, which is offloaded to CPU and left uncompiled when TAEHV
    # does the decoding.
    return VideoGenerator.from_pretrained(
        args.model,
        pipeline_config=cfg,
        num_gpus=args.num_gpus,
        use_fsdp_inference=False,
        dit_cpu_offload=False,
        dit_layerwise_offload=False,
        vae_cpu_offload=args.taehv,
        text_encoder_cpu_offload=False,
        pin_cpu_memory=False,
        enable_torch_compile=use_compile,
        enable_torch_compile_text_encoder=use_compile,
        enable_torch_compile_vae=use_compile and not args.taehv,
        **overrides,
    )


def main() -> None:
    """Benchmark FP4-linear (or bf16) Wan2.1-1.3B generation and print timings.

    Parses the CLI flags, optionally loads TAEHV, builds the generator, runs
    ``--warmups`` untimed generations followed by ``--benchmark-runs`` timed
    ones, writes the last TAEHV-decoded video under ``video_samples/`` and
    prints min / max / mean / std of the collected timings. The generator is
    shut down even if a warmup, benchmark or save step raises.

    Raises:
        SystemExit: if CUDA is not available.
        FileNotFoundError: if TAEHV decoding is enabled and the checkpoint
            path does not exist.
    """
    parser = argparse.ArgumentParser(
        description="FP4 linear Wan2.1-1.3B with TAEHV decoding benchmark")
    parser.add_argument("--model", default="Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
                        help="Model path or HuggingFace ID")
    parser.add_argument("--baseline", action="store_true",
                        help="Run dense bf16 instead of FP4 linear")
    parser.add_argument("--no-compile", action="store_true",
                        help="Disable torch.compile (eager)")
    parser.add_argument("--taehv", action=argparse.BooleanOptionalAction,
                        default=True,
                        help="Decode with TAEHV instead of the full Wan VAE "
                             "(use --no-taehv for the Wan VAE path)")
    parser.add_argument("--taehv_checkpoint", default=DEFAULT_TAEHV_CHECKPOINT,
                        help="Path to the TAEHV taew2_1.pth checkpoint")
    parser.add_argument("--distilled_model", default=DEFAULT_DISTILLED_MODEL,
                        help="HuggingFace ID (or local path) of a distilled "
                             "transformer checkpoint to load on top of --model. "
                             "Pass '' to use the base --model weights instead.")
    parser.add_argument("--num_gpus", type=int, default=1)
    parser.add_argument("--infer_steps", type=int, default=3)
    parser.add_argument("--guidance_scale", type=float, default=1.0)
    parser.add_argument("--warmups", type=int, default=5,
                        help="Warmup runs before timing (default: 5).")
    parser.add_argument("--benchmark-runs", type=int, default=20,
                        help="Timed runs to collect min/max/mean/std over (default: 20).")
    args = parser.parse_args()

    if not torch.cuda.is_available():
        raise SystemExit("CUDA is required for FP4 inference.")

    cap = torch.cuda.get_device_capability()
    print(f"GPU: {torch.cuda.get_device_name()} (capability {cap[0]}.{cap[1]})")
    if not args.baseline and cap[0] < 10:
        print("Warning: NVFP4 requires Blackwell (capability 10.0+); "
              "FP4 kernels may be unavailable on this GPU.")

    # The mode string doubles as the output file suffix and the report label.
    mode = "bf16" if args.baseline else "fp4_linear"
    mode += "_taehv" if args.taehv else "_wanvae"
    if not args.no_compile:
        mode += "_compile"
    print(f"Mode: {mode.upper()}")

    # Load TAEHV before the (slow) generator build so a bad checkpoint path
    # fails fast.
    taehv = TaehvDecoder(resolve_taehv_checkpoint(args.taehv_checkpoint)) \
        if args.taehv else None

    # Format one row of the min / max / mean / std table.
    def _stat_row(name: str, xs: list[float]) -> str:
        # stdev needs at least two samples; report 0.0 for a single run.
        std = statistics.stdev(xs) if len(xs) > 1 else 0.0
        return (f"  {name:<11}{min(xs):>8.3f}{max(xs):>9.3f}"
                f"{statistics.mean(xs):>9.3f}{std:>9.3f}")

    generator = build_generator(args)

    # From here on the generator exists, so guarantee shutdown() runs even if
    # a warmup, benchmark or save step fails.
    try:
        os.makedirs(OUTPUT_PATH, exist_ok=True)

        # Warmup: pay the DiT torch.compile cost and warm TAEHV's cuDNN algo
        # selection + allocator growth (~0.2s on the first decode) so the timed
        # runs below measure steady-state latency only.
        with silence_request_log():
            for _ in range(args.warmups):
                warm = generator.generate(request={
                    "prompt": PROMPT,
                    "sampling": {"num_inference_steps": 2, "guidance_scale": args.guidance_scale},
                    "output": {"save_video": False, "return_frames": args.taehv},
                })
                if args.taehv:
                    taehv.decode(warm.samples)

        output_path = os.path.join(OUTPUT_PATH, f"raccoon_{mode}.mp4")

        # Benchmark: time each run end-to-end. ``denoise`` is the generator's
        # own generation_time; for TAEHV we add the in-script decode. The mp4
        # is not written inside the loop so disk I/O never pollutes the
        # timings; on the TAEHV path the final run's frames are saved below.
        # NOTE(review): with --no-taehv every run passes save_video=False, so
        # no mp4 appears to be written on that path -- confirm this is intended.
        denoise_times: list[float] = []
        decode_times: list[float] = []
        totals: list[float] = []
        frames = None
        with silence_request_log():
            for i in range(args.benchmark_runs):
                result = generator.generate(request={
                    "prompt": PROMPT,
                    "sampling": {
                        "num_inference_steps": args.infer_steps,
                        "guidance_scale": args.guidance_scale,
                    },
                    "output": {
                        "save_video": False,
                        "return_frames": args.taehv,
                        "output_path": output_path,
                    },
                })
                denoise_elapsed = result.generation_time
                denoise_times.append(denoise_elapsed)

                if args.taehv:
                    # Synchronize on both sides so the timer covers exactly
                    # the GPU work of the decode.
                    torch.cuda.synchronize()
                    decode_start = time.perf_counter()
                    frames = taehv.decode(result.samples)
                    torch.cuda.synchronize()
                    decode_elapsed = time.perf_counter() - decode_start
                    decode_times.append(decode_elapsed)
                    total = denoise_elapsed + decode_elapsed
                else:
                    total = denoise_elapsed
                totals.append(total)

                line = f"  run {i + 1:02d}/{args.benchmark_runs}: {total:.3f}s"
                if args.taehv:
                    line += (f" (denoise {denoise_elapsed:.3f}s + "
                             f"decode {decode_elapsed:.3f}s)")
                print(line)

        # frames stays None when --benchmark-runs is 0.
        if args.taehv and frames is not None:
            imageio.mimsave(output_path, frames, fps=16, format="mp4")
            print(f"Saved video to {output_path}")

        # Report min / max / mean / std over the timed runs.
        if totals:
            print(f"\n[{mode.upper()}] {args.benchmark_runs} runs, {args.warmups} warmup, "
                  f"{args.infer_steps} steps:")
            print(f"  {'metric':<11}{'min':>8}{'max':>9}{'mean':>9}{'std':>9}   (s)")
            print(_stat_row("total", totals))
            if args.taehv:
                print(_stat_row("denoise", denoise_times))
                print(_stat_row("decode", decode_times))
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()

attention_example.py

import os
import time

from fastvideo import VideoGenerator

def main():
    """Load Wan2.1-T2V-1.3B with the FLASH_ATTN backend and time one generation.

    Prints the model loading time, the video generation time and the total
    execution time, each in seconds.
    """
    # Select the attention backend through the environment.
    os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "FLASH_ATTN"

    t_begin = time.perf_counter()
    load_options = {
        "model_path": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
        "num_gpus": 1,
        "dit_cpu_offload": False,
        "vae_cpu_offload": False,
        "text_encoder_cpu_offload": True,
        # set to false if low CPU RAM or hit obscure "CUDA error: Invalid argument"
        "pin_cpu_memory": True,
    }
    gen = VideoGenerator.from_pretrained(**load_options)
    load_time = time.perf_counter() - t_begin
    print(f"Model loading time: {load_time:.2f} seconds")

    prompt = "Will Smith casually eats noodles, his relaxed demeanor contrasting with the energetic background of a bustling street food market. The scene captures a mix of humor and authenticity. Mid-shot framing, vibrant lighting."

    t_generate = time.perf_counter()
    gen.generate_video(prompt=prompt, seed=1024, output_path="example_outputs/")
    generation_time = time.perf_counter() - t_generate
    print(f"Video generation time: {generation_time:.2f} seconds")

    # Total covers both loading and generation.
    total_time = time.perf_counter() - t_begin
    print(f"Total execution time: {total_time:.2f} seconds")

if __name__ == "__main__":
    main()

fp4_attn_wan2_1_1_3b.py

"""FP4 Flash Attention 4 inference example on Blackwell GPUs.

Quantizes Q and K to NVFP4 E2M1 with per-block E4M3 scale factors,
achieving up to 1.39x attention kernel speedup over BF16 FA4.

Requirements:
    - Blackwell GPU (B200/B300, sm100a/sm103a)
    - flash-attention-fp4, cutlass-dsl, flashinfer
    - See docs/inference/optimizations.md for installation

Usage:
    python fp4_attn_wan2_1_1_3b.py --nvfp4_fa4
    python fp4_attn_wan2_1_1_3b.py  # BF16 baseline
"""

import argparse
import os
import time

from fastvideo import VideoGenerator

OUTPUT_PATH = "video_samples"


def main():
    """Time one Wan2.1 generation with BF16 or NVFP4-quantized FA4 attention.

    Parses the CLI flags, builds the generator, runs short untimed warmup
    generations (two when ``--compile`` is set, otherwise one), then times a
    single generation that also writes ``video_samples/raccoon_<mode>.mp4``.
    The generator is shut down even if a generation raises.
    """
    parser = argparse.ArgumentParser(description="FP4 FA4 video generation benchmark")
    parser.add_argument("--nvfp4_fa4", action="store_true",
                        help="Enable NVFP4 FP4 quantized QK flash attention")
    parser.add_argument("--model", default="Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
                        help="Model path or HuggingFace ID")
    parser.add_argument("--compile", action="store_true",
                        help="Enable torch.compile for DIT")
    parser.add_argument("--num_gpus", type=int, default=1)
    parser.add_argument("--infer_steps", type=int, default=50)
    args = parser.parse_args()

    # The mode string labels the console output and the output file name.
    mode = "nvfp4" if args.nvfp4_fa4 else "bf16"
    if args.compile:
        mode += "_compile"
    print(f"Mode: {mode.upper()}")

    generator = VideoGenerator.from_pretrained(
        args.model,
        num_gpus=args.num_gpus,
        nvfp4_fa4=args.nvfp4_fa4,
        # FSDP inference is only enabled on the BF16 baseline path.
        use_fsdp_inference=not args.nvfp4_fa4,
        dit_cpu_offload=False,
        dit_layerwise_offload=False,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        enable_torch_compile=args.compile,
    )

    prompt = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
    )

    # Guarantee shutdown() runs even if a warmup or the timed run fails.
    try:
        # Untimed 2-step warmup generations before the measured run.
        n_warmup = 2 if args.compile else 1
        for _ in range(n_warmup):
            generator.generate(request={"prompt": prompt, "sampling": {"num_inference_steps": 2},
                                        "output": {"save_video": False}})

        os.makedirs(OUTPUT_PATH, exist_ok=True)
        # perf_counter is monotonic, unlike time.time(), so the measured
        # interval cannot be skewed by system clock adjustments.
        start = time.perf_counter()
        generator.generate(request={
            "prompt": prompt,
            "sampling": {"num_inference_steps": args.infer_steps},
            "output": {"save_video": True, "output_path": os.path.join(OUTPUT_PATH, f"raccoon_{mode}.mp4")},
        })
        elapsed = time.perf_counter() - start
        # The interval covers the whole generate() call, including saving
        # the video.
        print(f"[{mode.upper()}] {args.infer_steps} steps in {elapsed:.2f}s "
              f"({args.infer_steps / elapsed:.2f} it/s)")
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()

fp8_wan2_1_1_3b.py

"""FP8 weight quantization inference example.

Runs Wan2.1-T2V-1.3B with FP8 e4m3 quantized DiT linear layers (attention
projections and FFN). Weights are quantized in-place after loading; activations
are quantized dynamically at runtime. Reduces GPU memory relative to BF16 and
can improve throughput on sm89+ GPUs.

Requirements:
    - GPU: sm89+ (H100, L40S, RTX 4090, Ada Lovelace, or newer)
      Falls back to a bf16 dequant path on older GPUs.
    - TAEHV (optional): Follow install instructions at https://github.com/madebyollin/taehv

Usage:
    python fp8_wan2_1_1_3b.py              # FP8 per-tensor (default)
    python fp8_wan2_1_1_3b.py --bf16       # BF16 baseline
    python fp8_wan2_1_1_3b.py --granularity channel  # per-channel (higher accuracy but slower)
    python fp8_wan2_1_1_3b.py --taehv-checkpoint /path/to/taew2_1.pth
"""

import argparse
import os
import sys
import time

import torch

OUTPUT_PATH = "video_samples"


def load_taehv(checkpoint_path, device="cuda", dtype=torch.float16):
    """Load the TAEHV tiny autoencoder for inference.

    The directory containing ``checkpoint_path`` is prepended to ``sys.path``
    (if not already present) so that ``taehv`` can be imported from there.

    Args:
        checkpoint_path: Path to the TAEHV checkpoint file.
        device: Device the model is moved to.
        dtype: Dtype the model weights are cast to.

    Returns:
        The TAEHV model on ``device``/``dtype``, switched to eval mode.
    """
    repo_dir = os.path.dirname(checkpoint_path)
    if repo_dir not in sys.path:
        sys.path.insert(0, repo_dir)
    from taehv import TAEHV
    print(f"Loading TAEHV from {checkpoint_path}...")
    # The model is only used for decoding, so put it in eval mode.
    model = TAEHV(checkpoint_path=checkpoint_path).to(device, dtype).eval()
    print("TAEHV loaded.")
    return model


@torch.no_grad()  # type: ignore[misc]
def decode_with_taehv(taehv_model, latents):
    """Decode a latent batch with TAEHV into a list of uint8 RGB frames.

    Args:
        taehv_model: Model exposing ``parameters()`` and ``decode_video``.
        latents: 5-D latent tensor; axes 1 and 2 are swapped before decoding
            (presumably NCTHW -> NTCHW -- confirm against the pipeline output).

    Returns:
        A list with one ``(H, W, 3)`` uint8 numpy array per decoded frame of
        batch element 0.
    """
    # Look up the model's device/dtype once instead of once per attribute.
    ref_param = next(taehv_model.parameters())
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=ref_param.device, dtype=ref_param.dtype)
    decoded = taehv_model.decode_video(latents, parallel=False, show_progress_bar=False)
    # Quantize and lay out every frame of sample 0 in one vectorized pass,
    # with a single device-to-host copy instead of one per frame.
    frames = (decoded[0].clamp(0, 1) * 255).byte().permute(0, 2, 3, 1).cpu().numpy()
    # Iterating the leading axis yields one HWC frame per entry, so the
    # return type is still a list of frames.
    return list(frames)


def main():
    """Time one FP8 (or BF16) generation, optionally decoding with TAEHV.

    Parses the CLI flags, builds the generator, runs one untimed warmup
    generation when torch.compile is enabled, then times a single generation
    and writes ``video_samples/raccoon_<mode>.mp4``. With
    ``--taehv-checkpoint`` the pipeline returns latents that are decoded and
    saved in this script; otherwise the pipeline saves the video itself. The
    generator is shut down even if a generation raises.
    """
    parser = argparse.ArgumentParser(description="FP8 video generation benchmark")
    parser.add_argument("--bf16", action="store_true",
                        help="BF16 baseline (no FP8 quantization)")
    parser.add_argument("--granularity", choices=["tensor", "channel"], default="tensor",
                        help="FP8 weight scale granularity: tensor (faster) or channel (more accurate)")
    parser.add_argument("--taehv-checkpoint", default=None, metavar="PATH",
                        help="Path to taew2_1.pth; enables TAEHV tiny autoencoder decoding")
    parser.add_argument("--model", default="FastVideo/FastWan-QAD-FP8-1.3B",
                        help="Model path or HuggingFace ID")
    parser.add_argument("--no-compile", action="store_true", help="Disable torch.compile for the DiT")
    parser.add_argument("--num_gpus", type=int, default=1)
    parser.add_argument("--infer_steps", type=int, default=3)
    args = parser.parse_args()

    # setdefault leaves a backend already chosen in the environment untouched.
    os.environ.setdefault("FASTVIDEO_ATTENTION_BACKEND", "SAGE_ATTN")

    from fastvideo import VideoGenerator
    from fastvideo.layers.quantization import get_quantization_config

    mode = "bf16" if args.bf16 else f"fp8_{args.granularity}"
    if not args.no_compile:
        mode += "_compile"
    use_taehv = args.taehv_checkpoint is not None
    print(f"Mode: {mode.upper()}" + ("  decoder=TAEHV" if use_taehv else "  decoder=VAE"))

    taehv_model = load_taehv(args.taehv_checkpoint) if use_taehv else None
    if use_taehv:
        # Import up front so the module import time is not counted inside
        # the timed region below.
        import imageio

    # transformer_quant needs a QuantizationConfig *instance* — the bare string
    # is not resolved on the from_pretrained kwarg path.
    extra = {} if args.bf16 else {
        "transformer_quant": get_quantization_config("FP8")(granularity=args.granularity)
    }
    generator = VideoGenerator.from_pretrained(
        args.model,
        num_gpus=args.num_gpus,
        use_fsdp_inference=False,
        dit_cpu_offload=False,
        dit_layerwise_offload=False,
        # The pipeline VAE is offloaded and left uncompiled when TAEHV decodes.
        vae_cpu_offload=use_taehv,
        text_encoder_cpu_offload=False,
        pin_cpu_memory=False,
        enable_torch_compile=not args.no_compile,
        enable_torch_compile_vae=not args.no_compile and not use_taehv,
        output_type="latent" if use_taehv else "pil",
        **extra,
    )

    prompt = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
    )

    # Guarantee shutdown() runs even if a warmup or the timed run fails.
    try:
        # One untimed warmup generation, only when torch.compile is enabled.
        n_warmup = 1 if not args.no_compile else 0
        for _ in range(n_warmup):
            generator.generate(request={"prompt": prompt, "sampling": {"num_inference_steps": 3, "guidance_scale": 1.0},
                                        "output": {"save_video": False}})

        os.makedirs(OUTPUT_PATH, exist_ok=True)
        video_path = os.path.join(OUTPUT_PATH, f"raccoon_{mode}.mp4")
        # perf_counter is monotonic, unlike time.time(), so the measured
        # interval cannot be skewed by system clock adjustments.
        start = time.perf_counter()
        if use_taehv:
            result = generator.generate(request={
                "prompt": prompt,
                "sampling": {"num_inference_steps": args.infer_steps, "guidance_scale": 1.0},
                "output": {"save_video": False},
            })
            frames = decode_with_taehv(taehv_model, result.samples)
            imageio.mimsave(video_path, frames, fps=16, format="mp4")
            print(f"Saved TAEHV-decoded video to: {video_path}")
        else:
            generator.generate(request={
                "prompt": prompt,
                "sampling": {"num_inference_steps": args.infer_steps, "guidance_scale": 1.0},
                "output": {"save_video": True, "output_path": video_path},
            })
        # The interval covers generation plus decoding/saving the video.
        elapsed = time.perf_counter() - start
        print(f"[{mode.upper()}] {args.infer_steps} steps in {elapsed:.2f}s "
              f"({args.infer_steps / elapsed:.2f} it/s)")
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()

nvfp4_qat_wan2_1_1_3b.py

"""NVFP4 + Attn-QAT (modified SageAttention3) inference on Blackwell.

Runs Wan2.1-T2V-1.3B fully in 4-bit: NVFP4 linear layers (activations
quantized on the fly) together with the modified SageAttention3 FP4 attention
backend (``ATTN_QAT_INFER``). This is the inference half of the
Quantization-Aware Distillation (QAD) recipe.

Requirements:
    - RTX 5090 / consumer Blackwell (sm_120a). The attn_qat_infer kernel hard
      gates on sm_120; on other GPUs it falls back to Flash Attention.
    - The attn_qat_infer kernel built into fastvideo-kernel (see #1455) and
      flashinfer for the NVFP4 linear matmuls.

Usage:
    python nvfp4_qat_wan2_1_1_3b.py                 # NVFP4 linear + Attn-QAT attn
    python nvfp4_qat_wan2_1_1_3b.py --bf16          # BF16 baseline
"""

import argparse
import os
import time

OUTPUT_PATH = "video_samples"


def main():
    """Time one Wan2.1 generation with NVFP4 linears + Attn-QAT, or BF16.

    Parses the CLI flags, selects the attention backend through the
    environment (unless ``--bf16``), builds the generator, runs short untimed
    warmup generations (two when ``--compile`` is set, otherwise one), then
    times a single generation that also writes
    ``video_samples/raccoon_<mode>.mp4``. The generator is shut down even if
    a generation raises.
    """
    parser = argparse.ArgumentParser(description="NVFP4 + Attn-QAT video generation")
    parser.add_argument("--bf16", action="store_true",
                        help="BF16 baseline (no NVFP4 linear, default attention)")
    parser.add_argument("--model", default="Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
                        help="Model path or HuggingFace ID")
    parser.add_argument("--quant-method", default="nvfp4_qat", choices=["nvfp4_qat", "NVFP4"],
                        help="Linear quantization config. Wan-2.1 uses nvfp4_qat (matches its "
                             "to_q/k/v/out + ffn layers); NVFP4 is LTX2-specific and will NOT "
                             "quantize Wan.")
    parser.add_argument("--compile", action="store_true", help="Enable torch.compile for the DiT")
    parser.add_argument("--num_gpus", type=int, default=1)
    parser.add_argument("--infer_steps", type=int, default=50)
    args = parser.parse_args()

    # The attention backend is selected via env var before the engine starts.
    # ATTN_QAT_INFER -> AttnQatInferBackend (modified SageAttention3 FP4).
    if not args.bf16:
        os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "ATTN_QAT_INFER"

    # Import after the env var so the platform picks up the selection.
    from fastvideo import VideoGenerator
    from fastvideo.layers.quantization import get_quantization_config

    # The mode string labels the console output and the output file name.
    mode = "bf16" if args.bf16 else args.quant_method
    if args.compile:
        mode += "_compile"
    print(f"Mode: {mode.upper()}")

    # transformer_quant needs a QuantizationConfig *instance* — the bare string
    # is not resolved on the from_pretrained kwarg path.
    extra = {} if args.bf16 else {"transformer_quant": get_quantization_config(args.quant_method)()}
    generator = VideoGenerator.from_pretrained(
        args.model,
        num_gpus=args.num_gpus,
        # FSDP inference is only enabled on the BF16 baseline path.
        use_fsdp_inference=args.bf16,
        dit_cpu_offload=False,
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        enable_torch_compile=args.compile,
        **extra,
    )

    prompt = (
        "A curious raccoon peers through a vibrant field of yellow sunflowers, its eyes "
        "wide with interest. The playful yet serene atmosphere is complemented by soft "
        "natural light filtering through the petals. Mid-shot, warm and cheerful tones."
    )

    # Guarantee shutdown() runs even if a warmup or the timed run fails.
    try:
        # Untimed 2-step warmup generations before the measured run.
        n_warmup = 2 if args.compile else 1
        for _ in range(n_warmup):
            generator.generate(request={"prompt": prompt, "sampling": {"num_inference_steps": 2},
                                        "output": {"save_video": False}})

        os.makedirs(OUTPUT_PATH, exist_ok=True)
        # perf_counter is monotonic, unlike time.time(), so the measured
        # interval cannot be skewed by system clock adjustments.
        start = time.perf_counter()
        generator.generate(request={
            "prompt": prompt,
            "sampling": {"num_inference_steps": args.infer_steps},
            "output": {"save_video": True, "output_path": os.path.join(OUTPUT_PATH, f"raccoon_{mode}.mp4")},
        })
        elapsed = time.perf_counter() - start
        # The interval covers the whole generate() call, including saving
        # the video.
        print(f"[{mode.upper()}] {args.infer_steps} steps in {elapsed:.2f}s "
              f"({args.infer_steps / elapsed:.2f} it/s)")
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()

text_encoder_quant_example.py

from fastvideo import VideoGenerator
import argparse

OUTPUT_PATH = "video_samples_wan2_2_5B_ti2v"


def main(text_encoder_path: str):
    """Generate one image-conditioned video with a quantized text encoder.

    Args:
        text_encoder_path: Path to the quantized text encoder safetensors
            file (for Wan 2.2, "umt5_xxl_fp8_e4m3fn_scaled.safetensors").
    """
    # FastVideo picks the optimal default arguments for a known model; for a
    # local path it makes a best-effort attempt to identify them.
    load_options = dict(
        # FastVideo will automatically handle distributed setup
        num_gpus=1,
        use_fsdp_inference=True,
        dit_cpu_offload=True,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=False,
        # AbsMaxFP8 is the quantization method used by ComfyUI;
        # check fastvideo/layers/quantization/* for more quantization methods
        override_text_encoder_quant="AbsMaxFP8",
        override_text_encoder_safetensors=text_encoder_path,
        # set to false if low CPU RAM or hit obscure "CUDA error: Invalid argument"
        pin_cpu_memory=True,
    )
    generator = VideoGenerator.from_pretrained(
        "Wan-AI/Wan2.2-TI2V-5B-Diffusers", **load_options)

    # Passing image_path is what switches the request to image-to-video.
    image_path = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/wan_i2v_input.JPG"
    prompt = "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside."
    generator.generate_video(
        prompt,
        output_path=OUTPUT_PATH,
        save_video=True,
        image_path=image_path,
    )


if __name__ == "__main__":
    # Script entry point: the quantized text-encoder checkpoint has no
    # default, so --text_encoder_path must be supplied on the command line.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--text_encoder_path",
        type=str,
        required=True,
        help="Path to the quantized text encoder safetensors file.",
    )
    args = parser.parse_args()
    # Hand the parsed path to main().
    main(args.text_encoder_path)

torch_compile_example.py

"""torch.compile A/B example for FastVideo.

`enable_torch_compile=True` compiles the DiT submodules that declare
`_compile_conditions` for a substantial end-to-end speedup (e.g.
Wan2.1-T2V-1.3B on A100: roughly 24% less end-to-end time). It is off by
default.

The first compiled generation pays a one-time graph-build cost; it
amortizes over later generations with the same input shapes. This
script does one un-measured warmup then a measured run so the reported
number is steady-state, not graph-build — measuring the warmup is the
most common way to wrongly conclude compile is slower.

Usage:
    # baseline (eager)
    python torch_compile_example.py
    # compiled
    python torch_compile_example.py --compile
"""

import argparse
import os
import time

from fastvideo import VideoGenerator

PROMPT = (
    "A high-definition video of a robotic arm welding a metal structure, "
    "bright sparks and smoke, industrial setting."
)


def main() -> None:
    """Run an unmeasured warmup generation, then a measured one, and report both.

    ``--compile`` toggles ``enable_torch_compile``. Only the measured run
    saves its video (``video_samples/torch_compile_measured.mp4``). The
    generator is shut down even if a generation raises.
    """
    cli = argparse.ArgumentParser(description="torch.compile A/B")
    cli.add_argument("--model", default="Wan-AI/Wan2.1-T2V-1.3B-Diffusers")
    cli.add_argument("--compile", action="store_true",
                     help="Enable torch.compile for the DiT")
    cli.add_argument("--num_gpus", type=int, default=1)
    args = cli.parse_args()

    if args.compile:
        mode = "COMPILE"
    else:
        mode = "BASELINE"
    print(f"Mode: {mode}  (enable_torch_compile={args.compile})")

    os.makedirs("video_samples", exist_ok=True)

    generator = VideoGenerator.from_pretrained(
        args.model,
        num_gpus=args.num_gpus,
        enable_torch_compile=args.compile,
    )

    def _run(tag: str) -> float:
        """Generate once and return the elapsed seconds; save only "measured"."""
        # Typed-request API (generate_video is deprecated). Prompt, seed and
        # shapes are identical across runs so the compiled graph is reused.
        output_opts: dict = {"save_video": tag == "measured"}
        if output_opts["save_video"]:
            output_opts["output_path"] = (
                f"video_samples/torch_compile_{tag}.mp4")
        request: dict = {
            "prompt": PROMPT,
            "sampling": {"seed": 1024},
            "output": output_opts,
        }
        began = time.perf_counter()
        generator.generate(request)
        return time.perf_counter() - began

    try:
        # Warmup absorbs the one-time graph build under --compile; its time
        # is printed but not reported as the result.
        warmup_s = _run("warmup")
        print(f"warmup: {warmup_s:.2f}s "
              f"({'incl. graph build' if args.compile else 'cold start'})")

        # Second run with the same shapes/seed: this is the steady-state number.
        measured_s = _run("measured")
        print(f"=== {mode} steady-state e2e: {measured_s:.2f}s ===")
    finally:
        generator.shutdown()


if __name__ == "__main__":
    main()