Skip to content

io

Classes

fastvideo.eval.io.NoAudioStreamError

Bases: ValueError

Raised when a video file has no audio stream to extract.

Functions

fastvideo.eval.io.as_video

as_video(x: str | Path | Tensor | Video) -> Video

Coerce path/tensor/Video → :class:Video for the pool to decode.

Path strings and :class:pathlib.Path become Video(source=str(x)); the pool then calls :func:load_video on first use. Tensors become Video(source=None, frames=x) — the pool sees .frames already populated and forwards untouched. :class:Video instances pass through.

Source code in fastvideo/eval/io/inputs.py
def as_video(x: str | Path | torch.Tensor | Video) -> Video:
    """Normalize a video-like input into a :class:`Video`.

    * A :class:`Video` instance is handed back unchanged (same object).
    * A ``str`` or :class:`pathlib.Path` is wrapped as
      ``Video(source=str(x))`` — no decoding happens here.
    * A :class:`torch.Tensor` is wrapped as
      ``Video(source=None, frames=x)`` so the frames travel with it.

    Raises
    ------
    TypeError
        If *x* is none of the supported types.
    """
    # Already the target type: nothing to wrap.
    if isinstance(x, Video):
        return x

    if isinstance(x, (str, Path)):
        wrapped = Video(source=str(x))
    elif isinstance(x, torch.Tensor):
        wrapped = Video(source=None, frames=x)
    else:
        raise TypeError(f"Cannot coerce {type(x).__name__} to Video")
    return wrapped

fastvideo.eval.io.build_eval_kwargs

build_eval_kwargs(row: dict, video_path: Path, *, fps: float = 24.0) -> dict[str, Any]

Build evaluator kwargs from a sample row + a video on disk.

Loads the video as (T,C,H,W) and adds the leading batch dim. Forwards prompt (as scalar text_prompt) and auxiliary_info (as scalar dict) when present on the row — matches the one-sample-per-call contract that the evaluator and every metric assume.

Source code in fastvideo/eval/io/paths.py
def build_eval_kwargs(row: dict, video_path: Path, *, fps: float = 24.0) -> dict[str, Any]:
    """Assemble the evaluator kwargs for one sample row and its video file.

    The video at *video_path* is read with ``load_video`` and given a
    leading batch dimension via ``unsqueeze(0)``.  ``row["prompt"]`` is
    forwarded as ``text_prompt`` whenever the key exists, and
    ``row["auxiliary_info"]`` is forwarded as ``auxiliary_info`` only
    when it is present *and* truthy (an empty dict is dropped).

    Returns
    -------
    dict[str, Any]
        Always contains ``"video"`` and ``"fps"``; ``"text_prompt"`` and
        ``"auxiliary_info"`` are added conditionally as described above.
    """
    from fastvideo.eval.io.video import load_video

    # load_video yields the un-batched clip; the evaluator is fed one
    # sample per call, hence the extra leading dim.
    batched = load_video(str(video_path)).unsqueeze(0)
    kwargs: dict[str, Any] = dict(video=batched, fps=fps)

    if "prompt" in row:
        kwargs["text_prompt"] = row["prompt"]

    aux_info = row.get("auxiliary_info")
    if aux_info:
        kwargs["auxiliary_info"] = aux_info

    return kwargs

fastvideo.eval.io.default_filename

default_filename(row: dict, idx: int, ext: str = '.mp4') -> str

<sanitized-prompt>-<idx>.mp4 — VBench-style.

Source code in fastvideo/eval/io/paths.py
def default_filename(row: dict, idx: int, ext: str = ".mp4") -> str:
    """Return the VBench-style file name ``<sanitized-prompt>-<idx><ext>``.

    The stem is ``sanitize_prompt(row["prompt"])``; *ext* (default
    ``.mp4``) is appended verbatim, so it must include its leading dot.
    """
    stem = sanitize_prompt(row["prompt"])
    return "{}-{}{}".format(stem, idx, ext)

fastvideo.eval.io.extract_audio_track

extract_audio_track(video_path: str | Path, *, output_dir: str | Path, sample_rate: int | None = None, codec: str = 'pcm_s16le') -> Path

Pull the first audio stream from video_path to a .wav in output_dir.

Idempotent — if output_dir / {stem}.wav already exists, the existing file is returned without re-decoding. This makes the helper safe to call repeatedly from parallel workers on overlapping inputs (the cache hit is the fast path).

Parameters

  • video_path : Source video file (anything PyAV can open: mp4, mkv, webm, …).
  • output_dir : Directory the .wav lands in. Created if it doesn't exist.
  • sample_rate : If set, resample to this rate. Default (None) preserves the source rate — most audio metrics resample again internally to their own target rate, so adding a resample step here would just be wasted work.
  • codec : PCM codec for the output. pcm_s16le (default) gives 16-bit little-endian PCM, the most broadly compatible WAV format.

Returns

Path Absolute path to the extracted .wav.

Raises

NoAudioStreamError If video_path has no audio stream.

Source code in fastvideo/eval/io/audio.py
def extract_audio_track(
    video_path: str | Path,
    *,
    output_dir: str | Path,
    sample_rate: int | None = None,
    codec: str = "pcm_s16le",
) -> Path:
    """Pull the first audio stream from *video_path* to a ``.wav`` in *output_dir*.

    Idempotent — if ``output_dir / {stem}.wav`` already exists, the
    existing file is returned without re-decoding.  The audio is encoded
    into a uniquely named scratch file in the same directory and moved
    onto the final name with :func:`os.replace` only after the encoder
    has finished, so a crashed or concurrent extraction never leaves a
    truncated ``.wav`` behind for the cache check to pick up.

    Parameters
    ----------
    video_path :
        Source video file (anything PyAV can open: mp4, mkv, webm, …).
    output_dir :
        Directory the ``.wav`` lands in.  Created if it doesn't exist.
    sample_rate :
        If set, resample to this rate.  Default (``None``) preserves the
        source rate — most audio metrics resample again internally to
        their own target rate, so adding a resample step here would just
        be wasted work.
    codec :
        PCM codec for the output.  ``pcm_s16le`` (default) gives 16-bit
        little-endian PCM, the most broadly compatible WAV format.

    Returns
    -------
    Path
        Absolute path to the extracted ``.wav``.

    Raises
    ------
    NoAudioStreamError
        If *video_path* has no audio stream.
    """
    import os
    import uuid

    import av

    in_path = Path(video_path)
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = (out_dir / f"{in_path.stem}.wav").resolve()
    if out_path.exists():
        # Cache hit: a previous call already produced the complete file.
        return out_path

    container = av.open(str(in_path))
    try:
        audio_streams = [s for s in container.streams if s.type == "audio"]
        if not audio_streams:
            raise NoAudioStreamError(f"No audio stream in {in_path}")
        in_stream = audio_streams[0]
        target_rate = sample_rate or in_stream.rate

        # Scratch file lives next to the target so os.replace stays on one
        # filesystem; the random suffix keeps concurrent callers apart.
        tmp_path = out_path.with_name(f"{out_path.name}.{uuid.uuid4().hex[:8]}.part")
        try:
            out = av.open(str(tmp_path), "w", format="wav")
            try:
                out_stream = out.add_stream(codec, rate=target_rate)
                # Match input layout (mono / stereo / surround) — audio metrics
                # that want mono will downmix themselves.
                if in_stream.layout is not None:
                    out_stream.layout = in_stream.layout
                resampler = None
                if sample_rate is not None and sample_rate != in_stream.rate:
                    resampler = av.AudioResampler(format=out_stream.format, layout=out_stream.layout, rate=target_rate)

                def _encode_frames(frames) -> None:
                    """Encode *frames* and mux the resulting packets into *out*."""
                    for f in frames:
                        # PyAV needs pts cleared so the encoder reassigns.
                        f.pts = None
                        for packet in out_stream.encode(f):
                            out.mux(packet)

                for frame in container.decode(in_stream):
                    _encode_frames(resampler.resample(frame) if resampler is not None else [frame])
                if resampler is not None:
                    # Passing None drains samples still buffered in the
                    # resampler; without it the tail of the track is lost.
                    _encode_frames(resampler.resample(None))
                for packet in out_stream.encode():  # flush
                    out.mux(packet)
            finally:
                out.close()
            # Publish only a fully written file under the cached name.
            os.replace(tmp_path, out_path)
        finally:
            # No-op after a successful replace; removes the partial file
            # when anything above raised.
            tmp_path.unlink(missing_ok=True)
    finally:
        container.close()
    return out_path

fastvideo.eval.io.extract_frames

extract_frames(video: Tensor, n_frames: int | None = None) -> Tensor

Uniformly sample n_frames from a (T, C, H, W) video tensor.

Source code in fastvideo/eval/io/video.py
def extract_frames(video: torch.Tensor, n_frames: int | None = None) -> torch.Tensor:
    """Uniformly sample *n_frames* from a ``(T, C, H, W)`` video tensor."""
    if n_frames is None or n_frames >= video.shape[0]:
        return video
    indices = torch.linspace(0, video.shape[0] - 1, n_frames).long()
    return video[indices]

fastvideo.eval.io.glob_videos

glob_videos(videos_dir: Path, row: dict, ext: str = '.mp4') -> list[Path]

Find every generated video for row, sorted by trailing -<idx>.

Source code in fastvideo/eval/io/paths.py
def glob_videos(videos_dir: Path, row: dict, ext: str = ".mp4") -> list[Path]:
    """Find every generated video for *row*, sorted by trailing ``-<idx>``.

    Files are matched as ``<sanitized-prompt>-*<ext>`` inside
    *videos_dir*.  The sanitized prompt is glob-escaped first, so prompt
    characters such as ``[`` or ``]`` are matched literally instead of
    being read as a character class.

    Returns
    -------
    list[Path]
        Matches ordered by the integer after the last ``-`` in the stem.
        Files whose suffix is not an integer sort first (index ``-1``);
        ties are broken by file name so the order does not depend on
        directory listing order.
    """
    import glob

    stem = sanitize_prompt(row["prompt"])
    pattern = f"{glob.escape(stem)}-*{ext}"

    def _sort_key(p: Path) -> tuple[int, str]:
        try:
            idx = int(p.stem.rsplit("-", 1)[1])
        except (IndexError, ValueError):
            # No parseable index: keep the file, but ahead of indexed ones.
            idx = -1
        return idx, p.name

    return sorted(videos_dir.glob(pattern), key=_sort_key)

fastvideo.eval.io.load_video

load_video(source: str | Tensor | list, **kwargs) -> Tensor

Load a video as a (T, C, H, W) float32 tensor in [0, 1].

Supported source types:

  • str / Path – path to .mp4 / .avi / .gif file, or a directory of frame images (sorted alphabetically).
  • torch.Tensor – validated to have 4 dims (T, C, H, W) and returned cast to float32 (values are not rescaled).
  • list[PIL.Image] – stacked into a tensor.
Source code in fastvideo/eval/io/video.py
def load_video(source: str | torch.Tensor | list, **kwargs) -> torch.Tensor:
    """Load a video as a ``(T, C, H, W)`` float32 tensor in ``[0, 1]``.

    Supported *source* types:

    * **str / Path** – path to ``.mp4`` / ``.avi`` / ``.gif`` file, or a
      directory of frame images (sorted alphabetically).
    * **torch.Tensor** – returned as-is after shape validation.
    * **list[PIL.Image]** – stacked into a tensor.
    """
    if isinstance(source, torch.Tensor):
        if source.ndim != 4:
            raise ValueError(f"Expected video tensor with 4 dims (T,C,H,W), got {source.ndim}")
        return source.float()

    if isinstance(source, list):
        frames = [_pil_to_tensor(img) for img in source]
        return torch.stack(frames)

    path = Path(source)
    if path.is_dir():
        return _load_frame_dir(path)
    return _load_video_file(str(path))

fastvideo.eval.io.samples_from

samples_from(*, video: PathSpec | None = None, reference: PathSpec | None = None, audio: PathSpec | None = None, reference_audio: PathSpec | None = None, text_prompt: str | None = None, text_prompts: str | Path | list[str] | None = None, fps: float | None = None, auxiliary_info: dict | list[dict] | None = None, extras: dict | list[dict] | None = None, extract_audio: bool | str | Path = False, extract_workers: int = 4) -> list[dict]

Build a samples list from path-style inputs.

Parameters

video, reference, audio, reference_audio : File path, directory of files (sorted by name), or any iterable of paths. Pass whichever modalities apply to the metrics you plan to run — they attach to sample["video"] / sample["reference"] / sample["audio"] / sample["reference_audio"] respectively. Video paths are wrapped in :class:Video so :class:VideoPool decodes them lazily in parallel; audio paths stay as strings (audio metrics each load with their own resample / preprocess). text_prompt : A single prompt string broadcast onto every sample. text_prompts : A list of strings (one per sample), or a path to a .jsonl / .json file containing per-sample prompts. fps : Scalar fps broadcast onto every sample. auxiliary_info : Single dict (broadcast) or list of dicts (zipped) for sample["auxiliary_info"] — vbench structured-prompt metrics read this. extras : Catch-all per-sample attachments. Use for metric-specific keys the dedicated kwargs don't cover (scenario, view, actions, calibration, reference_take2, ...). Pass a single dict to broadcast or a list-of-dicts to zip; the keys merge into each sample dict. extract_audio : If truthy, auto-extract audio from each video / reference source into .wav files via PyAV and attach the paths under sample["audio"] / sample["reference_audio"]. Pass a path for a persistent cache, True for a tempdir. Skipped silently for videos with no audio stream; ignored wherever audio / reference_audio is already explicit. extract_workers : Parallel workers for extract_audio.

Returns

list[dict] Canonical samples shape — hand directly to :meth:Evaluator.evaluate.

Cardinality and shape

Let N = len(generated inputs) (the agreed length of whichever of video / audio you passed). References are attached 1:1 onto the first N samples; any extras (when |ref| > N) become standalone role-tagged samples at the end of the list, so set metrics like FVD see the full reference corpus while per-sample paired metrics like LPIPS only run on the first N pairs.

Notes

"Missing" keys are simply absent from the sample dict. Metrics handle them per their own contract (sample.get(...) for optional, sample[...] raises for required, or :meth:BaseMetric._skip for opt-in skip behavior). One fat samples list with many keys can serve many metrics — each reads its subset.

Source code in fastvideo/eval/io/inputs.py
def samples_from(
    *,
    # Modality inputs — pass whichever you have, in any combination.
    video: PathSpec | None = None,
    reference: PathSpec | None = None,
    audio: PathSpec | None = None,
    reference_audio: PathSpec | None = None,
    # Per-sample attachments.  Scalars broadcast to every sample; lists
    # / jsonl paths zip per-sample.
    text_prompt: str | None = None,
    text_prompts: str | Path | list[str] | None = None,
    fps: float | None = None,
    auxiliary_info: dict | list[dict] | None = None,
    # Catch-all for exotic metric inputs (physics_iq scenarios,
    # synthetic_optical_flow actions, ...).  Single dict broadcasts;
    # list of dicts zips.  Keys merge into each sample dict.
    extras: dict | list[dict] | None = None,
    # Sugar: pull audio off the video sources via PyAV.  Pass a path
    # to use a persistent on-disk cache, ``True`` for a system tempdir.
    extract_audio: bool | str | Path = False,
    extract_workers: int = 4,
) -> list[dict]:
    """Build a samples list from path-style inputs.

    Parameters
    ----------
    video, reference, audio, reference_audio :
        File path, directory of files (sorted by name), or any iterable
        of paths.  Pass whichever modalities apply to the metrics you
        plan to run — they attach to ``sample["video"]`` /
        ``sample["reference"]`` / ``sample["audio"]`` /
        ``sample["reference_audio"]`` respectively.  Video paths are
        wrapped in :class:`Video` so :class:`VideoPool` decodes them
        lazily in parallel; audio paths stay as strings (audio metrics
        each load with their own resample / preprocess).
    text_prompt :
        A single prompt string broadcast onto every sample.  Mutually
        exclusive with ``text_prompts``.
    text_prompts :
        A list of strings (one per sample), or a path to a ``.jsonl`` /
        ``.json`` file containing per-sample prompts.  Mutually
        exclusive with ``text_prompt``.
    fps :
        Scalar fps broadcast onto every sample.
    auxiliary_info :
        Single dict (broadcast) or list of dicts (zipped) for
        ``sample["auxiliary_info"]`` — vbench structured-prompt
        metrics read this.
    extras :
        Catch-all per-sample attachments.  Use for metric-specific keys
        the dedicated kwargs don't cover (``scenario``, ``view``,
        ``actions``, ``calibration``, ``reference_take2``, ...).  Pass
        a single dict to broadcast or a list-of-dicts to zip; the keys
        merge into each sample dict.
    extract_audio :
        If truthy, auto-extract audio from each ``video`` /
        ``reference`` source into ``.wav`` files via PyAV and attach
        the paths under ``sample["audio"]`` / ``sample["reference_audio"]``.
        Pass a path for a persistent cache, ``True`` for a tempdir.
        Skipped silently for videos with no audio stream; ignored
        wherever ``audio`` / ``reference_audio`` is already explicit.
    extract_workers :
        Parallel workers for ``extract_audio``.

    Returns
    -------
    list[dict]
        Canonical samples shape — hand directly to
        :meth:`Evaluator.evaluate`.

    Raises
    ------
    ValueError
        If both ``text_prompt`` and ``text_prompts`` are given, if
        neither ``video`` nor ``audio`` is given, or if ``video`` and
        ``audio`` expand to different lengths.

    Cardinality and shape
    ---------------------
    Let ``N = len(generated inputs)`` (the agreed length of whichever
    of ``video`` / ``audio`` you passed).  References are attached
    1:1 onto the first N samples; any extras (when ``|ref| > N``)
    become standalone role-tagged samples at the end of the list, so
    set metrics like FVD see the full reference corpus while per-sample
    paired metrics like LPIPS only run on the first N pairs.

    Notes
    -----
    "Missing" keys are simply absent from the sample dict.  Metrics
    handle them per their own contract (``sample.get(...)`` for
    optional, ``sample[...]`` raises for required, or
    :meth:`BaseMetric._skip` for opt-in skip behavior).  One fat samples
    list with many keys can serve many metrics — each reads its subset.
    """
    # Reject contradictory arguments up front, before any path expansion
    # or prompts-file loading can fail with a less helpful error.
    if (text_prompts is not None) and (text_prompt is not None):
        raise ValueError("Pass either text_prompt (broadcast) or text_prompts (per-sample), not both.")

    gen_video = _expand(video, _VIDEO_EXTS) if video is not None else None
    gen_audio = _expand(audio, _AUDIO_EXTS) if audio is not None else None
    ref_video = _expand(reference, _VIDEO_EXTS) if reference is not None else None
    ref_audio = _expand(reference_audio, _AUDIO_EXTS) if reference_audio is not None else None

    if gen_video is None and gen_audio is None:
        raise ValueError("samples_from: pass at least one of video= or audio=")

    # All "generated" inputs must agree on length — they describe the
    # same N samples in different modalities.
    n_candidates = [len(x) for x in (gen_video, gen_audio) if x is not None]
    if len(set(n_candidates)) != 1:
        raise ValueError(f"samples_from: generated inputs have inconsistent lengths {n_candidates}")
    n = n_candidates[0]

    prompts = _broadcast(text_prompts if text_prompts is not None else text_prompt, n, loader=_load_prompts)
    aux = _broadcast(auxiliary_info, n)
    extras_per_sample = _broadcast(extras, n)

    samples: list[dict[str, Any]] = []
    for i in range(n):
        s: dict[str, Any] = {}
        if gen_video is not None:
            s["video"] = as_video(gen_video[i])
        if gen_audio is not None:
            s["audio"] = str(gen_audio[i])
        # References pair up by position; samples past the end of a
        # shorter reference list simply carry no reference key.
        if ref_video is not None and i < len(ref_video):
            s["reference"] = as_video(ref_video[i])
        if ref_audio is not None and i < len(ref_audio):
            s["reference_audio"] = str(ref_audio[i])
        if prompts is not None:
            s["text_prompt"] = prompts[i]
        if fps is not None:
            s["fps"] = fps
        if aux is not None:
            s["auxiliary_info"] = aux[i]
        # Extras go last so they can deliberately override the keys above.
        if extras_per_sample is not None:
            s.update(extras_per_sample[i])
        samples.append(s)

    # Unmatched references → role-tagged set samples.  Per-sample paired
    # metrics skip these via the worker's role-skip rule; set metrics
    # (FVD, FAD) accumulate the features.
    if ref_video is not None and len(ref_video) > n:
        samples.extend({"video": as_video(v), "role": "reference"} for v in ref_video[n:])
    if ref_audio is not None and len(ref_audio) > n:
        samples.extend({"audio": str(a), "role": "reference"} for a in ref_audio[n:])

    if extract_audio:
        _attach_extracted_audio(samples, extract_audio, extract_workers)
    return samples

fastvideo.eval.io.sanitize_prompt

sanitize_prompt(prompt: str, max_len: int = 100) -> str

Prompt → safe filename stem.

Source code in fastvideo/eval/io/paths.py
def sanitize_prompt(prompt: str, max_len: int = 100) -> str:
    """Reduce *prompt* to a string usable as a file-name stem.

    Steps, in order: keep the first *max_len* characters, delete every
    match of ``_INVALID_CHARS``, trim surrounding whitespace, trim
    leading/trailing dots, then collapse each whitespace run to a
    single space.  If nothing is left, ``"output"`` is returned.
    """
    truncated = prompt[:max_len]
    cleaned = _INVALID_CHARS.sub("", truncated).strip().strip(".")
    collapsed = re.sub(r"\s+", " ", cleaned)
    if collapsed:
        return collapsed
    return "output"

Modules

fastvideo.eval.io.audio

Audio-extraction helper for video files.

Audio metrics (audio.clap_score, audio.frechet_distance, …) read file paths under sample["audio"] and do their own per-metric preprocessing (CLAP at 48 kHz, PaSST at 32 kHz, whisper at 16 kHz mono, …). When the source video carries an audio track (V2A / T2A-V model outputs), :func:samples_from(extract_audio=True) calls :func:extract_audio_track to pull a .wav next to it once per video.

Why a separate utility instead of in-pool decode: every audio metric wants a different sample rate / channel count / format, so the pool can't usefully pre-load audio into a single canonical tensor the way it pre-loads video frames. Paths-in / paths-out is the lingua franca for audio in this codebase.

Classes

fastvideo.eval.io.audio.NoAudioStreamError

Bases: ValueError

Raised when a video file has no audio stream to extract.

Functions

fastvideo.eval.io.audio.extract_audio_track
extract_audio_track(video_path: str | Path, *, output_dir: str | Path, sample_rate: int | None = None, codec: str = 'pcm_s16le') -> Path

Pull the first audio stream from video_path to a .wav in output_dir.

Idempotent — if output_dir / {stem}.wav already exists, the existing file is returned without re-decoding. This makes the helper safe to call repeatedly from parallel workers on overlapping inputs (the cache hit is the fast path).

Parameters

  • video_path : Source video file (anything PyAV can open: mp4, mkv, webm, …).
  • output_dir : Directory the .wav lands in. Created if it doesn't exist.
  • sample_rate : If set, resample to this rate. Default (None) preserves the source rate — most audio metrics resample again internally to their own target rate, so adding a resample step here would just be wasted work.
  • codec : PCM codec for the output. pcm_s16le (default) gives 16-bit little-endian PCM, the most broadly compatible WAV format.

Returns

Path Absolute path to the extracted .wav.

Raises

NoAudioStreamError If video_path has no audio stream.

Source code in fastvideo/eval/io/audio.py
def extract_audio_track(
    video_path: str | Path,
    *,
    output_dir: str | Path,
    sample_rate: int | None = None,
    codec: str = "pcm_s16le",
) -> Path:
    """Pull the first audio stream from *video_path* to a ``.wav`` in *output_dir*.

    Idempotent — if ``output_dir / {stem}.wav`` already exists, the
    existing file is returned without re-decoding.  The audio is encoded
    into a uniquely named scratch file in the same directory and moved
    onto the final name with :func:`os.replace` only after the encoder
    has finished, so a crashed or concurrent extraction never leaves a
    truncated ``.wav`` behind for the cache check to pick up.

    Parameters
    ----------
    video_path :
        Source video file (anything PyAV can open: mp4, mkv, webm, …).
    output_dir :
        Directory the ``.wav`` lands in.  Created if it doesn't exist.
    sample_rate :
        If set, resample to this rate.  Default (``None``) preserves the
        source rate — most audio metrics resample again internally to
        their own target rate, so adding a resample step here would just
        be wasted work.
    codec :
        PCM codec for the output.  ``pcm_s16le`` (default) gives 16-bit
        little-endian PCM, the most broadly compatible WAV format.

    Returns
    -------
    Path
        Absolute path to the extracted ``.wav``.

    Raises
    ------
    NoAudioStreamError
        If *video_path* has no audio stream.
    """
    import os
    import uuid

    import av

    in_path = Path(video_path)
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = (out_dir / f"{in_path.stem}.wav").resolve()
    if out_path.exists():
        # Cache hit: a previous call already produced the complete file.
        return out_path

    container = av.open(str(in_path))
    try:
        audio_streams = [s for s in container.streams if s.type == "audio"]
        if not audio_streams:
            raise NoAudioStreamError(f"No audio stream in {in_path}")
        in_stream = audio_streams[0]
        target_rate = sample_rate or in_stream.rate

        # Scratch file lives next to the target so os.replace stays on one
        # filesystem; the random suffix keeps concurrent callers apart.
        tmp_path = out_path.with_name(f"{out_path.name}.{uuid.uuid4().hex[:8]}.part")
        try:
            out = av.open(str(tmp_path), "w", format="wav")
            try:
                out_stream = out.add_stream(codec, rate=target_rate)
                # Match input layout (mono / stereo / surround) — audio metrics
                # that want mono will downmix themselves.
                if in_stream.layout is not None:
                    out_stream.layout = in_stream.layout
                resampler = None
                if sample_rate is not None and sample_rate != in_stream.rate:
                    resampler = av.AudioResampler(format=out_stream.format, layout=out_stream.layout, rate=target_rate)

                def _encode_frames(frames) -> None:
                    """Encode *frames* and mux the resulting packets into *out*."""
                    for f in frames:
                        # PyAV needs pts cleared so the encoder reassigns.
                        f.pts = None
                        for packet in out_stream.encode(f):
                            out.mux(packet)

                for frame in container.decode(in_stream):
                    _encode_frames(resampler.resample(frame) if resampler is not None else [frame])
                if resampler is not None:
                    # Passing None drains samples still buffered in the
                    # resampler; without it the tail of the track is lost.
                    _encode_frames(resampler.resample(None))
                for packet in out_stream.encode():  # flush
                    out.mux(packet)
            finally:
                out.close()
            # Publish only a fully written file under the cached name.
            os.replace(tmp_path, out_path)
        finally:
            # No-op after a successful replace; removes the partial file
            # when anything above raised.
            tmp_path.unlink(missing_ok=True)
    finally:
        container.close()
    return out_path

fastvideo.eval.io.inputs

Input-shape helpers: paths → samples list.

The :class:Evaluator and every metric consume the same internal representation — a list[dict] with one dict per sample. Building that list by hand is the largest source of ceremony in user scripts. :func:samples_from is a pure function that turns path-style inputs (generated videos / audio, optional paired references, optional per-sample prompts / fps / metadata) into the canonical samples list, ready to hand to :meth:Evaluator.evaluate.

Design rules:

  • No Evaluator state, no metric introspection, no I/O beyond reading the prompts file when given (and writing extracted .wav files when extract_audio= is set). Just dict assembly.
  • "Extra" keys on a sample are free — metrics each read what they need and ignore the rest. So one fat samples list naturally serves many metrics in one Evaluator.
  • No "primary modality" axis. Each modality is a named kwarg (video=, audio=); pass whichever you have. Both is fine.
  • No mode axis. The shape of the output is determined by cardinality: when |gen| == |ref| the samples list is pair-zipped; when |gen| < |ref| the unmatched references become role-tagged set samples for corpus-shaped metrics (FVD / FAD) without disturbing per-sample paired metrics (LPIPS / PSNR / SSIM / gt_optical_flow).

Classes

Functions

fastvideo.eval.io.inputs.as_video
as_video(x: str | Path | Tensor | Video) -> Video

Coerce path/tensor/Video → :class:Video for the pool to decode.

Path strings and :class:pathlib.Path become Video(source=str(x)); the pool then calls :func:load_video on first use. Tensors become Video(source=None, frames=x) — the pool sees .frames already populated and forwards untouched. :class:Video instances pass through.

Source code in fastvideo/eval/io/inputs.py
def as_video(x: str | Path | torch.Tensor | Video) -> Video:
    """Normalize a video-like input into a :class:`Video`.

    * A :class:`Video` instance is handed back unchanged (same object).
    * A ``str`` or :class:`pathlib.Path` is wrapped as
      ``Video(source=str(x))`` — no decoding happens here.
    * A :class:`torch.Tensor` is wrapped as
      ``Video(source=None, frames=x)`` so the frames travel with it.

    Raises
    ------
    TypeError
        If *x* is none of the supported types.
    """
    # Already the target type: nothing to wrap.
    if isinstance(x, Video):
        return x

    if isinstance(x, (str, Path)):
        wrapped = Video(source=str(x))
    elif isinstance(x, torch.Tensor):
        wrapped = Video(source=None, frames=x)
    else:
        raise TypeError(f"Cannot coerce {type(x).__name__} to Video")
    return wrapped
fastvideo.eval.io.inputs.samples_from
samples_from(*, video: PathSpec | None = None, reference: PathSpec | None = None, audio: PathSpec | None = None, reference_audio: PathSpec | None = None, text_prompt: str | None = None, text_prompts: str | Path | list[str] | None = None, fps: float | None = None, auxiliary_info: dict | list[dict] | None = None, extras: dict | list[dict] | None = None, extract_audio: bool | str | Path = False, extract_workers: int = 4) -> list[dict]

Build a samples list from path-style inputs.

Parameters

video, reference, audio, reference_audio : File path, directory of files (sorted by name), or any iterable of paths. Pass whichever modalities apply to the metrics you plan to run — they attach to sample["video"] / sample["reference"] / sample["audio"] / sample["reference_audio"] respectively. Video paths are wrapped in :class:Video so :class:VideoPool decodes them lazily in parallel; audio paths stay as strings (audio metrics each load with their own resample / preprocess). text_prompt : A single prompt string broadcast onto every sample. text_prompts : A list of strings (one per sample), or a path to a .jsonl / .json file containing per-sample prompts. fps : Scalar fps broadcast onto every sample. auxiliary_info : Single dict (broadcast) or list of dicts (zipped) for sample["auxiliary_info"] — vbench structured-prompt metrics read this. extras : Catch-all per-sample attachments. Use for metric-specific keys the dedicated kwargs don't cover (scenario, view, actions, calibration, reference_take2, ...). Pass a single dict to broadcast or a list-of-dicts to zip; the keys merge into each sample dict. extract_audio : If truthy, auto-extract audio from each video / reference source into .wav files via PyAV and attach the paths under sample["audio"] / sample["reference_audio"]. Pass a path for a persistent cache, True for a tempdir. Skipped silently for videos with no audio stream; ignored wherever audio / reference_audio is already explicit. extract_workers : Parallel workers for extract_audio.

Returns

list[dict] Canonical samples shape — hand directly to :meth:Evaluator.evaluate.

Cardinality and shape

Let N = len(generated inputs) (the agreed length of whichever of video / audio you passed). References are attached 1:1 onto the first N samples; any extras (when |ref| > N) become standalone role-tagged samples at the end of the list, so set metrics like FVD see the full reference corpus while per-sample paired metrics like LPIPS only run on the first N pairs.

Notes

"Missing" keys are simply absent from the sample dict. Metrics handle them per their own contract (sample.get(...) for optional, sample[...] raises for required, or :meth:BaseMetric._skip for opt-in skip behavior). One fat samples list with many keys can serve many metrics — each reads its subset.

Source code in fastvideo/eval/io/inputs.py
def samples_from(
    *,
    # Modality inputs — pass whichever you have, in any combination.
    video: PathSpec | None = None,
    reference: PathSpec | None = None,
    audio: PathSpec | None = None,
    reference_audio: PathSpec | None = None,
    # Per-sample attachments.  Scalars broadcast to every sample; lists
    # / jsonl paths zip per-sample.
    text_prompt: str | None = None,
    text_prompts: str | Path | list[str] | None = None,
    fps: float | None = None,
    auxiliary_info: dict | list[dict] | None = None,
    # Catch-all for exotic metric inputs (physics_iq scenarios,
    # synthetic_optical_flow actions, ...).  Single dict broadcasts;
    # list of dicts zips.  Keys merge into each sample dict.
    extras: dict | list[dict] | None = None,
    # Sugar: pull audio off the video sources via PyAV.  Pass a path
    # to use a persistent on-disk cache, ``True`` for a system tempdir.
    extract_audio: bool | str | Path = False,
    extract_workers: int = 4,
) -> list[dict]:
    """Build a samples list from path-style inputs.

    Parameters
    ----------
    video, reference, audio, reference_audio :
        File path, directory of files (sorted by name), or any iterable
        of paths.  Pass whichever modalities apply to the metrics you
        plan to run — they attach to ``sample["video"]`` /
        ``sample["reference"]`` / ``sample["audio"]`` /
        ``sample["reference_audio"]`` respectively.  Video paths are
        wrapped in :class:`Video` so :class:`VideoPool` decodes them
        lazily in parallel; audio paths stay as strings (audio metrics
        each load with their own resample / preprocess).
    text_prompt :
        A single prompt string broadcast onto every sample.  Mutually
        exclusive with ``text_prompts``.
    text_prompts :
        A list of strings (one per sample), or a path to a ``.jsonl`` /
        ``.json`` file containing per-sample prompts.  Mutually
        exclusive with ``text_prompt``.
    fps :
        Scalar fps broadcast onto every sample.
    auxiliary_info :
        Single dict (broadcast) or list of dicts (zipped) for
        ``sample["auxiliary_info"]`` — vbench structured-prompt
        metrics read this.
    extras :
        Catch-all per-sample attachments.  Use for metric-specific keys
        the dedicated kwargs don't cover (``scenario``, ``view``,
        ``actions``, ``calibration``, ``reference_take2``, ...).  Pass
        a single dict to broadcast or a list-of-dicts to zip; the keys
        merge into each sample dict (applied last, so they override
        same-named keys set by the dedicated kwargs).
    extract_audio :
        If truthy, auto-extract audio from each ``video`` /
        ``reference`` source into ``.wav`` files via PyAV and attach
        the paths under ``sample["audio"]`` / ``sample["reference_audio"]``.
        Pass a path for a persistent cache, ``True`` for a tempdir.
        Skipped silently for videos with no audio stream; ignored
        wherever ``audio`` / ``reference_audio`` is already explicit.
    extract_workers :
        Parallel workers for ``extract_audio``.

    Returns
    -------
    list[dict]
        Canonical samples shape — hand directly to
        :meth:`Evaluator.evaluate`.

    Raises
    ------
    ValueError
        If both ``text_prompt`` and ``text_prompts`` are given, if
        neither ``video`` nor ``audio`` is given, or if ``video`` and
        ``audio`` expand to different lengths.

    Cardinality and shape
    ---------------------
    Let ``N = len(generated inputs)`` (the agreed length of whichever
    of ``video`` / ``audio`` you passed).  References are attached
    1:1 onto the first N samples; any surplus references (when
    ``|ref| > N``) become standalone samples tagged
    ``role="reference"`` at the end of the list, so set metrics like
    FVD see the full reference corpus while per-sample paired metrics
    like LPIPS only run on the first N pairs.

    Notes
    -----
    "Missing" keys are simply absent from the sample dict.  Metrics
    handle them per their own contract (``sample.get(...)`` for
    optional, ``sample[...]`` raises for required, or
    :meth:`BaseMetric._skip` for opt-in skip behavior).  One fat samples
    list with many keys can serve many metrics — each reads its subset.
    """
    # Reject the ambiguous prompt combination up front, before any path
    # expansion or prompt-file loading is attempted on a call that is
    # invalid anyway.
    if text_prompts is not None and text_prompt is not None:
        raise ValueError("Pass either text_prompt (broadcast) or text_prompts (per-sample), not both.")

    gen_video = _expand(video, _VIDEO_EXTS) if video is not None else None
    gen_audio = _expand(audio, _AUDIO_EXTS) if audio is not None else None
    ref_video = _expand(reference, _VIDEO_EXTS) if reference is not None else None
    ref_audio = _expand(reference_audio, _AUDIO_EXTS) if reference_audio is not None else None

    if gen_video is None and gen_audio is None:
        raise ValueError("samples_from: pass at least one of video= or audio=")

    # All "generated" inputs must agree on length — they describe the
    # same N samples in different modalities.
    n_candidates = [len(x) for x in (gen_video, gen_audio) if x is not None]
    if len(set(n_candidates)) != 1:
        raise ValueError(f"samples_from: generated inputs have inconsistent lengths {n_candidates}")
    n = n_candidates[0]

    prompts = _broadcast(text_prompts if text_prompts is not None else text_prompt, n, loader=_load_prompts)
    aux = _broadcast(auxiliary_info, n)
    extras_per_sample = _broadcast(extras, n)

    samples: list[dict[str, Any]] = []
    for i in range(n):
        s: dict[str, Any] = {}
        if gen_video is not None:
            s["video"] = as_video(gen_video[i])
        if gen_audio is not None:
            s["audio"] = str(gen_audio[i])
        # References may be shorter than N; samples past the end of the
        # reference list simply carry no reference key.
        if ref_video is not None and i < len(ref_video):
            s["reference"] = as_video(ref_video[i])
        if ref_audio is not None and i < len(ref_audio):
            s["reference_audio"] = str(ref_audio[i])
        if prompts is not None:
            s["text_prompt"] = prompts[i]
        if fps is not None:
            s["fps"] = fps
        if aux is not None:
            s["auxiliary_info"] = aux[i]
        if extras_per_sample is not None:
            # Merged last so metric-specific keys win over the defaults.
            s.update(extras_per_sample[i])
        samples.append(s)

    # Unmatched references → role-tagged set samples.  Per-sample paired
    # metrics skip these via the worker's role-skip rule; set metrics
    # (FVD, FAD) accumulate the features.
    if ref_video is not None and len(ref_video) > n:
        samples.extend({"video": as_video(v), "role": "reference"} for v in ref_video[n:])
    if ref_audio is not None and len(ref_audio) > n:
        samples.extend({"audio": str(a), "role": "reference"} for a in ref_audio[n:])

    if extract_audio:
        _attach_extracted_audio(samples, extract_audio, extract_workers)
    return samples

fastvideo.eval.io.paths

Filesystem helpers shared by eval scripts.

Provides the prompt-sanitization, default filename convention, and (row, video_path) → eval-kwargs builder. Free functions, not a class — :class:fastvideo.eval.Evaluator is the only stateful object in the eval surface; loops live in user scripts.

Functions

fastvideo.eval.io.paths.build_eval_kwargs
build_eval_kwargs(row: dict, video_path: Path, *, fps: float = 24.0) -> dict[str, Any]

Build evaluator kwargs from a sample row + a video on disk.

Loads the video as (T,C,H,W) and adds the leading batch dim. Forwards prompt (as scalar text_prompt) and auxiliary_info (as scalar dict) when present on the row — matches the one-sample-per-call contract that the evaluator and every metric assume.

Source code in fastvideo/eval/io/paths.py
def build_eval_kwargs(row: dict, video_path: Path, *, fps: float = 24.0) -> dict[str, Any]:
    """Assemble the keyword arguments for one evaluator call.

    The video at *video_path* is decoded with :func:`load_video` and given
    a leading batch dimension via ``unsqueeze(0)``.  ``row["prompt"]`` is
    forwarded as ``text_prompt`` whenever the key exists, and a truthy
    ``row["auxiliary_info"]`` is forwarded under the same name.  *fps* is
    always included.
    """
    from fastvideo.eval.io.video import load_video

    frames = load_video(str(video_path))  # (T, C, H, W) in [0, 1]
    out: dict[str, Any] = {"video": frames.unsqueeze(0), "fps": fps}  # video: (1, T, C, H, W)

    if "prompt" in row:
        out["text_prompt"] = row["prompt"]

    # Empty / missing auxiliary info is dropped rather than forwarded.
    if aux := row.get("auxiliary_info"):
        out["auxiliary_info"] = aux

    return out
fastvideo.eval.io.paths.default_filename
default_filename(row: dict, idx: int, ext: str = '.mp4') -> str

<sanitized-prompt>-<idx><ext> — VBench-style (ext defaults to .mp4).

Source code in fastvideo/eval/io/paths.py
def default_filename(row: dict, idx: int, ext: str = ".mp4") -> str:
    """Return ``<sanitized-prompt>-<idx><ext>`` — VBench-style naming.

    The stem comes from ``sanitize_prompt(row["prompt"])``; *ext* is
    appended verbatim and defaults to ``.mp4``.
    """
    stem = sanitize_prompt(row["prompt"])
    return "".join((stem, "-", str(idx), ext))
fastvideo.eval.io.paths.glob_videos
glob_videos(videos_dir: Path, row: dict, ext: str = '.mp4') -> list[Path]

Find every generated video for row, sorted by trailing -<idx>.

Source code in fastvideo/eval/io/paths.py
def glob_videos(videos_dir: Path, row: dict, ext: str = ".mp4") -> list[Path]:
    """Find every generated video for *row*, sorted by trailing ``-<idx>``.

    Searches *videos_dir* (non-recursively) for files named
    ``<sanitized-prompt>-*<ext>``.  Files whose stem does not end in an
    integer after the last ``-`` get sort key ``-1`` and therefore come
    first.
    """
    from glob import escape

    # The prompt stem is a literal filename prefix, not a pattern: escape
    # any glob metacharacters (``[``, ``*``, ``?``) that survive
    # sanitization so e.g. "a [red] car" matches its own file instead of
    # being read as a character class.
    stem = escape(sanitize_prompt(row["prompt"]))
    pattern = f"{stem}-*{ext}"
    files = list(videos_dir.glob(pattern))

    def _idx(p: Path) -> int:
        # Parse the integer after the final "-" in the stem; fall back to
        # -1 when it is missing or not numeric.
        try:
            return int(p.stem.rsplit("-", 1)[1])
        except (IndexError, ValueError):
            return -1

    return sorted(files, key=_idx)
fastvideo.eval.io.paths.sanitize_prompt
sanitize_prompt(prompt: str, max_len: int = 100) -> str

Prompt → safe filename stem.

Source code in fastvideo/eval/io/paths.py
def sanitize_prompt(prompt: str, max_len: int = 100) -> str:
    """Turn a prompt into a filename-safe stem.

    The prompt is truncated to *max_len* characters, stripped of anything
    matching ``_INVALID_CHARS``, trimmed of surrounding whitespace and then
    of surrounding dots, and has internal whitespace runs collapsed to a
    single space.  Falls back to ``"output"`` when nothing is left.
    """
    truncated = prompt[:max_len]
    cleaned = _INVALID_CHARS.sub("", truncated)
    trimmed = cleaned.strip().strip(".")
    collapsed = re.sub(r"\s+", " ", trimmed)
    if collapsed:
        return collapsed
    return "output"

fastvideo.eval.io.video

Functions

fastvideo.eval.io.video.extract_frames
extract_frames(video: Tensor, n_frames: int | None = None) -> Tensor

Uniformly sample n_frames from a (T, C, H, W) video tensor.

Source code in fastvideo/eval/io/video.py
def extract_frames(video: torch.Tensor, n_frames: int | None = None) -> torch.Tensor:
    """Uniformly sample *n_frames* from a ``(T, C, H, W)`` video tensor."""
    if n_frames is None or n_frames >= video.shape[0]:
        return video
    indices = torch.linspace(0, video.shape[0] - 1, n_frames).long()
    return video[indices]
fastvideo.eval.io.video.load_video
load_video(source: str | Tensor | list, **kwargs) -> Tensor

Load a video as a (T, C, H, W) float32 tensor in [0, 1].

Supported source types:

  • str / Path – path to .mp4 / .avi / .gif file, or a directory of frame images (sorted alphabetically).
  • torch.Tensor – must be 4-D (T, C, H, W); returned after a .float() cast, with no value rescaling.
  • list[PIL.Image] – stacked into a tensor.
Source code in fastvideo/eval/io/video.py
def load_video(source: str | torch.Tensor | list, **kwargs) -> torch.Tensor:
    """Load a video as a ``(T, C, H, W)`` float32 tensor in ``[0, 1]``.

    Supported *source* types:

    * **str / Path** – path to ``.mp4`` / ``.avi`` / ``.gif`` file, or a
      directory of frame images (sorted alphabetically).
    * **torch.Tensor** – returned as-is after shape validation.
    * **list[PIL.Image]** – stacked into a tensor.
    """
    if isinstance(source, torch.Tensor):
        if source.ndim != 4:
            raise ValueError(f"Expected video tensor with 4 dims (T,C,H,W), got {source.ndim}")
        return source.float()

    if isinstance(source, list):
        frames = [_pil_to_tensor(img) for img in source]
        return torch.stack(frames)

    path = Path(source)
    if path.is_dir():
        return _load_frame_dir(path)
    return _load_video_file(str(path))