Skip to content

benchmarks

Modules

fastvideo.models.loader.benchmarks.benchmark_weight_loading

Benchmark for model weight loading speed.

Measures the time to load model weights from safetensors files using different strategies (CPU vs GPU, broadcast vs independent).

Usage (single GPU): python fastvideo/models/loader/benchmarks/benchmark_weight_loading.py --model-path /path/to/model

Usage (multi-GPU, e.g. 4 GPUs): torchrun --nproc_per_node=4 fastvideo/models/loader/benchmarks/benchmark_weight_loading.py --model-path /path/to/model

Functions

fastvideo.models.loader.benchmarks.benchmark_weight_loading.benchmark_loading
benchmark_loading(files: list[str], to_cpu: bool, broadcast: bool, warmup: int, repeats: int, label: str) -> None

Run the weight loading benchmark and print results.

Source code in fastvideo/models/loader/benchmarks/benchmark_weight_loading.py
def benchmark_loading(
    files: list[str],
    to_cpu: bool,
    broadcast: bool,
    warmup: int,
    repeats: int,
    label: str,
) -> None:
    """Run the weight loading benchmark and print results.

    The files are iterated once to count tensors and bytes, then ``warmup``
    untimed passes are run, followed by ``repeats`` timed passes. Only rank 0
    logs; every rank takes part in the barriers when torch.distributed is
    initialized.

    Args:
        files: Safetensors files handed to ``safetensors_weights_iterator``.
        to_cpu: Forwarded as the iterator's ``to_cpu`` argument.
        broadcast: Forwarded as the iterator's ``broadcast`` argument.
        warmup: Number of untimed passes run before measuring.
        repeats: Number of timed passes. When it is not positive there is
            nothing to average, so a warning is logged instead of a summary.
        label: Tag prefixed to every log line.
    """
    distributed = dist.is_initialized()
    rank = dist.get_rank() if distributed else 0
    # Global process count; reported separately from the node group size.
    world_size = dist.get_world_size() if distributed else 1
    node_group = get_node_group()
    cuda_available = torch.cuda.is_available()

    # Count total params and bytes on first pass
    total_params = 0
    total_bytes = 0
    for _, tensor in safetensors_weights_iterator(
            files, to_cpu=to_cpu, broadcast=broadcast):
        total_params += 1
        total_bytes += tensor.nelement() * tensor.element_size()

    if rank == 0:
        logger.info("[%s] %d tensors, %.2f GB total",
                    label, total_params, total_bytes / 1e9)

    # Warmup
    for _ in range(warmup):
        for _ in safetensors_weights_iterator(
                files, to_cpu=to_cpu, broadcast=broadcast):
            pass
        if distributed:
            dist.barrier()

    # Timed runs
    times = []
    for _ in range(repeats):
        # Line all ranks up and drain pending GPU work before starting the clock.
        if distributed:
            dist.barrier()
        if cuda_available:
            torch.cuda.synchronize()

        t0 = time.perf_counter()
        for _ in safetensors_weights_iterator(
                files, to_cpu=to_cpu, broadcast=broadcast):
            pass
        if cuda_available:
            torch.cuda.synchronize()

        # The barrier is inside the timed region, so each sample covers the
        # time until every rank has finished its pass.
        if distributed:
            dist.barrier()
        times.append(time.perf_counter() - t0)

    if rank != 0:
        return

    if not times:
        # Avoid ZeroDivisionError / min() of an empty list when no run was timed.
        logger.warning("[%s] no timed runs (repeats=%d); skipping summary",
                       label, repeats)
        return

    avg = sum(times) / len(times)
    best = min(times)
    throughput = total_bytes / avg / 1e9
    logger.info(
        "[%s] avg %.3fs | best %.3fs | throughput %.2f GB/s "
        "(over %d runs, %d warmup, %d GPU(s), node_size=%d)",
        label, avg, best, throughput,
        repeats, warmup, world_size, node_group.world_size,
    )

fastvideo.models.loader.benchmarks.benchmark_weight_loading_comparison

A/B benchmark: independent-read vs rank-0-broadcast weight loading.

Compares two strategies:
  • "before" (independent): every rank reads safetensors from disk to GPU
  • "after" (broadcast): rank 0 reads from disk, broadcasts to other ranks
Usage

1 GPU

torchrun --nproc_per_node=1 fastvideo/models/loader/benchmarks/benchmark_weight_loading_comparison.py --model-path /path/to/model --subfolder transformer

2 GPUs

torchrun --nproc_per_node=2 fastvideo/models/loader/benchmarks/benchmark_weight_loading_comparison.py --model-path /path/to/model --subfolder transformer

4 GPUs

torchrun --nproc_per_node=4 fastvideo/models/loader/benchmarks/benchmark_weight_loading_comparison.py --model-path /path/to/model --subfolder transformer

Functions

fastvideo.models.loader.benchmarks.benchmark_weight_loading_comparison.load_broadcast
load_broadcast(files: list[str], device: str, node_group, async_op: bool = False)

After-PR behavior: rank 0 reads from disk, broadcasts to other ranks.

Source code in fastvideo/models/loader/benchmarks/benchmark_weight_loading_comparison.py
def load_broadcast(files: list[str], device: str, node_group,
                   async_op: bool = False):
    """After-PR behavior: rank 0 reads from disk, broadcasts to other ranks.

    Generator yielding ``(name, tensor)`` for every tensor in ``files``. The
    process whose ``node_group.local_rank`` is 0 reads each tensor with
    ``get_tensor``; every other process allocates an empty tensor of the same
    shape and dtype. When ``node_group.world_size > 1`` the tensor is then
    broadcast over ``node_group.device_group`` from that group's rank 0.

    Args:
        files: Paths of the safetensors files to read.
        device: Device string passed to ``safe_open`` and ``torch.empty``.
        node_group: Object providing ``local_rank``, ``world_size`` and
            ``device_group``.
        async_op: When True, broadcasts are issued with ``async_op=True`` and
            their handles are only waited on after each file has been fully
            iterated, so a yielded tensor may still be receiving data.

    Yields:
        Tuples of tensor name and tensor.
    """
    local_rank = node_group.local_rank
    needs_broadcast = node_group.world_size > 1
    group = None
    src = None
    if needs_broadcast:
        # Loop-invariant: resolve the process group and source rank once.
        group = node_group.device_group
        src = dist.get_global_rank(group, 0)

    handles = []
    for st_file in files:
        with safe_open(st_file, framework="pt", device=device) as f:
            # keys() is the safe_open API for enumerating tensor names; the
            # handle itself is not relied on to be iterable.
            for name in f.keys():  # noqa: SIM118
                if local_rank == 0:
                    param = f.get_tensor(name)
                else:
                    # Only the metadata is read here; the data arrives via broadcast.
                    sl = f.get_slice(name)
                    shape = sl.get_shape()
                    dtype = SAFETENSORS_TO_TORCH_DTYPE[sl.get_dtype()]
                    param = torch.empty(shape, device=device, dtype=dtype)
                if needs_broadcast:
                    if async_op:
                        handle = dist.broadcast(
                            param,
                            src=src,
                            async_op=True,
                            group=group,
                        )
                        handles.append(handle)
                    else:
                        dist.broadcast(
                            param,
                            src=src,
                            group=group,
                        )
                yield name, param
        if async_op:
            # Drain this file's outstanding broadcasts before opening the next one.
            for handle in handles:
                handle.wait()
            handles.clear()
fastvideo.models.loader.benchmarks.benchmark_weight_loading_comparison.load_independent
load_independent(files: list[str], device: str)

Before-PR behavior: every rank reads every tensor from disk to GPU.

Source code in fastvideo/models/loader/benchmarks/benchmark_weight_loading_comparison.py
def load_independent(files: list[str], device: str):
    """Before-PR behavior: every rank reads every tensor from disk to GPU.

    Generator that opens each safetensors file on ``device`` and yields every
    tensor it contains; no inter-process communication is performed.

    Args:
        files: Paths of the safetensors files to read.
        device: Device string passed to ``safe_open``.

    Yields:
        Tuples of tensor name and the tensor loaded with ``get_tensor``.
    """
    for st_file in files:
        with safe_open(st_file, framework="pt", device=device) as f:
            # keys() is the safe_open API for enumerating tensor names; the
            # handle itself is not relied on to be iterable.
            for name in f.keys():  # noqa: SIM118
                yield name, f.get_tensor(name)