Skip to content

eval

Modules

fastvideo.tests.eval.test_datasets_vbench

Smoke tests for the VBench prompt dataset and dataset registry.

Classes

Functions

fastvideo.tests.eval.test_evaluator_multi_gpu

Multi-replica eval through the public Evaluator API.

Skipped automatically when fewer than 2 CUDA devices are visible.

Functions

fastvideo.tests.eval.test_evaluator_multi_gpu.baseline_scores
baseline_scores()

Reference scores computed on a single-GPU evaluator. The multi-GPU runs must reproduce these exactly when handed the same input list — that's the only way to verify round-robin dispatch isn't dropping or reordering samples.

Source code in fastvideo/tests/eval/test_evaluator_multi_gpu.py
@pytest.fixture
def baseline_scores():
    """Provide ``(samples, scores)`` from a one-GPU evaluator run.

    Eight samples from ``_make_samples`` are scored with ``common.psnr``
    and ``common.ssim`` on ``cuda:0`` with ``num_gpus=1``. Multi-GPU tests
    feed the very same sample list back in and compare their output
    against these scores to detect dropped or reordered samples.
    """
    inputs = _make_samples(8)
    single_gpu = create_evaluator(
        metrics=["common.psnr", "common.ssim"],
        device="cuda:0",
        num_gpus=1,
    )
    try:
        # Returning from inside ``try`` still runs the ``finally`` clause,
        # so the evaluator is shut down before the fixture value is used.
        return inputs, single_gpu.evaluate(samples=inputs)
    finally:
        single_gpu.shutdown()
fastvideo.tests.eval.test_evaluator_multi_gpu.test_multi_gpu_dispatch_preserves_order_and_scores
test_multi_gpu_dispatch_preserves_order_and_scores(baseline_scores)

Same samples, multi-GPU dispatch — results must match the single-GPU baseline element-for-element. This verifies (a) the round-robin doesn't reorder, (b) every sample is scored exactly once, (c) the workers don't share mutable state.

Source code in fastvideo/tests/eval/test_evaluator_multi_gpu.py
def test_multi_gpu_dispatch_preserves_order_and_scores(baseline_scores):
    """Two-GPU results must equal the single-GPU baseline row by row.

    The baseline fixture supplies the sample list and its reference
    scores. The same list is evaluated with ``num_gpus=2`` and each row is
    compared, position by position, for both ``common.psnr`` and
    ``common.ssim``. A mismatch in length, key set, or score at any index
    points at samples being dropped, duplicated, or reordered.
    """
    samples, expected = baseline_scores

    multi_gpu = create_evaluator(
        metrics=["common.psnr", "common.ssim"],
        num_gpus=2,
    )
    try:
        got = multi_gpu.evaluate(samples=samples)
    finally:
        multi_gpu.shutdown()

    assert len(got) == len(expected)
    # (metric key, short label used in the failure message)
    checked_metrics = (("common.psnr", "psnr"), ("common.ssim", "ssim"))
    for row_idx, (actual, wanted) in enumerate(zip(got, expected)):
        assert set(actual.keys()) == {"common.psnr", "common.ssim"}, f"row {row_idx}"
        for metric, label in checked_metrics:
            assert actual[metric].score == pytest.approx(wanted[metric].score), \
                f"row {row_idx} {label} drift"
fastvideo.tests.eval.test_evaluator_multi_gpu.test_multi_gpu_evaluator_kwargs_form_runs_on_one_replica
test_multi_gpu_evaluator_kwargs_form_runs_on_one_replica()

The kwargs form (single sample) is documented to always hit worker 0; this test pins the contract so future refactors don't accidentally fan out a single call.

Source code in fastvideo/tests/eval/test_evaluator_multi_gpu.py
def test_multi_gpu_evaluator_kwargs_form_runs_on_one_replica():
    """A single kwargs-form call works on a two-replica evaluator.

    The original author notes that the kwargs (single-sample) form is
    documented to always run on worker 0. This test does not inspect which
    worker handled the call; it only checks that the call succeeds and
    that PSNR of a video against itself exceeds 50.
    """
    two_replica = create_evaluator(metrics=["common.psnr"], num_gpus=2)
    try:
        torch.manual_seed(0)
        clip = torch.rand(_T, _C, _H, _W)
        result = two_replica.evaluate(video=clip, reference=clip)
        # Identical inputs => very large PSNR.
        psnr = result["common.psnr"].score
        assert psnr > 50.0
    finally:
        two_replica.shutdown()
fastvideo.tests.eval.test_evaluator_multi_gpu.test_multi_gpu_release_cuda_memory_runs_clean
test_multi_gpu_release_cuda_memory_runs_clean()

release_cuda_memory must hit every replica without crashing.

Source code in fastvideo/tests/eval/test_evaluator_multi_gpu.py
def test_multi_gpu_release_cuda_memory_runs_clean():
    """``release_cuda_memory`` must not raise on a two-replica evaluator.

    Two samples are evaluated first so the replicas have done some work,
    then ``release_cuda_memory`` is called. The test passes as long as no
    exception escapes.
    """
    two_replica = create_evaluator(metrics=["common.psnr"], num_gpus=2)
    try:
        # Scores are irrelevant here; the call only warms up the replicas.
        two_replica.evaluate(samples=_make_samples(2))
        two_replica.release_cuda_memory()
    finally:
        two_replica.shutdown()

fastvideo.tests.eval.test_evaluator_paths

Path-input variants of the public Evaluator API.

The worker boundary accepts video / reference as either a pre-loaded (T, C, H, W) tensor or a path-like (str / Path). These tests pin the path-form so future refactors don't accidentally re-require pre-loaded tensors.

Classes

Functions

fastvideo.tests.eval.test_evaluator_paths.test_dispatcher_holds_paths_not_tensors_in_queue
test_dispatcher_holds_paths_not_tensors_in_queue(tmp_path)

Memory invariant: when many paths are passed, the queued samples are tiny strings, not full tensors. The test checks that every sample entry is still a plain string before dispatch, and that evaluation returns one result per sample.

Source code in fastvideo/tests/eval/test_evaluator_paths.py
def test_dispatcher_holds_paths_not_tensors_in_queue(tmp_path):
    """Path-only samples stay as strings until evaluation.

    Eight random clips are written to ``tmp_path`` as mp4 files and each
    sample references its clip by path for both ``video`` and
    ``reference``. The test checks that the sample dicts hold ``str``
    values only, that one result row comes back per sample, and that each
    self-paired PSNR exceeds 50.
    """
    torch.manual_seed(1)
    clip_count = 8
    samples = []
    for idx in range(clip_count):
        clip_file = tmp_path / f"clip_{idx}.mp4"
        _write_tensor_as_mp4(torch.rand(_T, _C, _H, _W), clip_file)
        samples.append({"video": str(clip_file), "reference": str(clip_file)})

    # Every entry is a plain string; no tensor is held by the sample list.
    for entry in samples:
        for key in ("video", "reference"):
            assert isinstance(entry[key], str)

    cpu_evaluator = create_evaluator(metrics=["common.psnr"], device="cpu")
    try:
        results = cpu_evaluator.evaluate(samples=samples)
    finally:
        cpu_evaluator.shutdown()

    assert len(results) == clip_count
    # A clip compared with itself should score a very high PSNR.
    for result in results:
        assert result["common.psnr"].score > 50.0
fastvideo.tests.eval.test_evaluator_paths.test_missing_path_surfaces_as_exception
test_missing_path_surfaces_as_exception(evaluator, tmp_path)

Decode failures must propagate, not silently produce a None score.

Source code in fastvideo/tests/eval/test_evaluator_paths.py
def test_missing_path_surfaces_as_exception(evaluator, tmp_path):
    """Evaluating a non-existent video path must raise.

    A path that was never created under ``tmp_path`` is passed as both
    ``video`` and ``reference``; any exception type satisfies the check.
    """
    missing_clip = str(tmp_path / "does_not_exist.mp4")
    with pytest.raises(Exception):
        evaluator.evaluate(video=missing_clip, reference=missing_clip)
fastvideo.tests.eval.test_evaluator_paths.test_one_shot_evaluate_accepts_paths
test_one_shot_evaluate_accepts_paths(video_paths)

The top-level fastvideo.eval.evaluate helper also flows paths.

Source code in fastvideo/tests/eval/test_evaluator_paths.py
def test_one_shot_evaluate_accepts_paths(video_paths):
    """The one-shot ``evaluate`` helper accepts string paths.

    The first fixture clip is compared with itself on CPU, so the
    resulting ``common.psnr`` score is expected to exceed 50.
    """
    clip_files, _ = video_paths
    clip = str(clip_files[0])
    # NOTE(review): other calls in this suite pass ``video=``; this one
    # passes ``generated=``. Confirm the helper's keyword name.
    result = evaluate(
        generated=clip,
        reference=clip,
        metrics=["common.psnr"],
        device="cpu",
    )
    assert result["common.psnr"].score > 50.0
fastvideo.tests.eval.test_evaluator_paths.test_path_form_score_matches_tensor_form
test_path_form_score_matches_tensor_form(evaluator, video_paths)

Loading via path must produce the same score as loading via the public load_video helper and passing the tensor in directly.

Source code in fastvideo/tests/eval/test_evaluator_paths.py
def test_path_form_score_matches_tensor_form(evaluator, video_paths):
    """Path input and pre-loaded tensor input must score the same.

    The first fixture clip is evaluated against itself twice: once by
    path, and once as the tensor returned by ``load_video`` for that same
    path. PSNR and SSIM must agree to within ``1e-4`` absolute.
    """
    from fastvideo.eval.io import load_video

    clip_files, _ = video_paths
    clip = str(clip_files[0])

    from_path = evaluator.evaluate(video=clip, reference=clip)
    decoded = load_video(clip)
    from_tensor = evaluator.evaluate(video=decoded, reference=decoded)

    for metric in ("common.psnr", "common.ssim"):
        assert from_path[metric].score == pytest.approx(
            from_tensor[metric].score, abs=1e-4)
fastvideo.tests.eval.test_evaluator_paths.test_samples_list_can_mix_paths_and_tensors
test_samples_list_can_mix_paths_and_tensors(evaluator, video_paths)

A single samples call can mix path and tensor entries.

Source code in fastvideo/tests/eval/test_evaluator_paths.py
def test_samples_list_can_mix_paths_and_tensors(evaluator, video_paths):
    """One ``samples`` list may contain both path and tensor inputs.

    The first sample pairs a path ``video`` with a tensor ``reference``;
    the second swaps the roles. Both rows must come back with a positive
    ``common.psnr`` score.
    """
    clip_files, frames = video_paths
    path_then_tensor = {"video": str(clip_files[0]), "reference": frames[0]}
    tensor_then_path = {"video": frames[1], "reference": str(clip_files[1])}

    results = evaluator.evaluate(samples=[path_then_tensor, tensor_then_path])

    assert len(results) == 2
    for result in results:
        assert result["common.psnr"].score > 0.0
fastvideo.tests.eval.test_evaluator_paths.video_paths
video_paths(tmp_path)

Two reproducible mp4s on disk + their pre-loaded tensors for parity.

Source code in fastvideo/tests/eval/test_evaluator_paths.py
@pytest.fixture
def video_paths(tmp_path):
    """Write two seeded random clips to disk and return them both ways.

    Returns ``(paths, tensors)``: the mp4 paths under ``tmp_path`` and the
    in-memory tensors that were written to them, in matching order.
    """
    torch.manual_seed(0)
    written = []
    for idx in range(2):
        frames = torch.rand(_T, _C, _H, _W)
        target = tmp_path / f"clip_{idx}.mp4"
        _write_tensor_as_mp4(frames, target)
        written.append((target, frames))

    # Split the (path, tensor) pairs into two parallel lists.
    paths = [target for target, _ in written]
    tensors = [frames for _, frames in written]
    return paths, tensors

fastvideo.tests.eval.test_evaluator_single

End-to-end tests for single-replica eval through the public API.

Runs the lightweight pixel-space metrics — common.psnr and common.ssim — under the three shapes that real callers use:

  • one-shot evaluate(video=..., reference=...) (the helper in fastvideo.eval.api);
  • a long-lived Evaluator, called once per sample;
  • a long-lived Evaluator, called with a list of sample dicts to fan out (samples=[...]).

GPU-only metrics live in separate test modules / classes; everything here runs on CPU so the suite stays cheap to invoke.

Classes

Functions

fastvideo.tests.eval.test_evaluator_single.gen_ref
gen_ref()

Reproducible (gen, ref) pair shaped (T, C, H, W).

Source code in fastvideo/tests/eval/test_evaluator_single.py
@pytest.fixture
def gen_ref():
    """Return a seeded ``(gen, ref)`` pair of random video tensors.

    Both tensors are drawn with ``torch.rand`` at shape
    ``(_T, _C, _H, _W)`` after seeding the global RNG with 0, so the pair
    is the same on every run.
    """
    torch.manual_seed(0)
    shape = (_T, _C, _H, _W)
    # Draw order matters for reproducibility: generated first, then reference.
    generated = torch.rand(*shape)
    reference = torch.rand(*shape)
    return generated, reference
fastvideo.tests.eval.test_evaluator_single.test_evaluator_accepts_legacy_5d_input
test_evaluator_accepts_legacy_5d_input(evaluator, gen_ref)

Callers that still pass (1, T, C, H, W) should get unwrapped.

Source code in fastvideo/tests/eval/test_evaluator_single.py
def test_evaluator_accepts_legacy_5d_input(evaluator, gen_ref):
    """Inputs with an extra leading dimension of size 1 are accepted.

    Both tensors get a leading axis via ``unsqueeze(0)`` before being
    passed in; the ``common.psnr`` result must still be well-formed.
    """
    batched_gen, batched_ref = (clip.unsqueeze(0) for clip in gen_ref)
    result = evaluator.evaluate(video=batched_gen, reference=batched_ref)
    _assert_well_formed(result["common.psnr"], "common.psnr")
fastvideo.tests.eval.test_evaluator_single.test_evaluator_psnr_identical_videos_is_high
test_evaluator_psnr_identical_videos_is_high(evaluator, gen_ref)

PSNR(x, x) is unbounded above; with our clamp it caps near 100 dB.

Source code in fastvideo/tests/eval/test_evaluator_single.py
def test_evaluator_psnr_identical_videos_is_high(evaluator, gen_ref):
    """Comparing a video with itself gives near-perfect scores.

    PSNR must exceed 50 and SSIM must equal 1.0 to within ``1e-5``.
    """
    clip, _ = gen_ref
    scores = evaluator.evaluate(video=clip, reference=clip)
    psnr = scores["common.psnr"].score
    ssim = scores["common.ssim"].score
    assert psnr > 50.0
    assert ssim == pytest.approx(1.0, abs=1e-5)
fastvideo.tests.eval.test_evaluator_single.test_evaluator_samples_list_preserves_input_order
test_evaluator_samples_list_preserves_input_order(evaluator)

When samples=[...] is passed, results must come back per sample.

Source code in fastvideo/tests/eval/test_evaluator_single.py
def test_evaluator_samples_list_preserves_input_order(evaluator):
    """``samples=[...]`` must return one row per sample, in input order.

    Four samples are built with increasing noise on the reference so that
    their scores differ. The test then checks three things:

    * the result is a list with one well-formed row per sample;
    * row ``i`` matches the score obtained by evaluating sample ``i`` on
      its own through the kwargs form, which pins the ordering;
    * a second ``samples=[...]`` call reproduces the first.
    """
    torch.manual_seed(1)
    samples = []
    for i in range(4):
        # Vary the reference enough that scores differ across rows.
        gen = torch.rand(_T, _C, _H, _W)
        ref = gen + 0.01 * (i + 1) * torch.rand_like(gen)
        samples.append({"video": gen, "reference": ref})

    out = evaluator.evaluate(samples=samples)
    assert isinstance(out, list)
    assert len(out) == len(samples)
    for row in out:
        assert set(row.keys()) == {"common.psnr", "common.ssim"}
        _assert_well_formed(row["common.psnr"], "common.psnr")

    # Order check: each row must correspond to the sample at the same
    # index. Because every sample has a different noise level, a reordered
    # result list would pair a row with the wrong single-sample score.
    for i, (sample, row) in enumerate(zip(samples, out)):
        single = evaluator.evaluate(video=sample["video"],
                                    reference=sample["reference"])
        assert row["common.psnr"].score == pytest.approx(
            single["common.psnr"].score, abs=1e-4), f"row {i} out of order"

    # Re-running should reproduce the scores (within float tolerance); no
    # nondeterministic scheduling effects are expected under single-GPU
    # dispatch.
    out2 = evaluator.evaluate(samples=samples)
    for a, b in zip(out, out2):
        assert a["common.psnr"].score == pytest.approx(b["common.psnr"].score)

fastvideo.tests.eval.test_evaluator_with_dataset

End-to-end test: prompt dataset → Evaluator.

Mirrors the canonical user flow:

ds = get_dataset("vbench", dimensions=[...])
ev = create_evaluator(metrics=[...], device=...)
for row in ds:
    video = my_generator(row["prompt"])
    scores = ev.evaluate(video=video, **row)

We don't actually generate videos — that would pull in a diffusion model. Instead we synthesize a reproducible random tensor per row, so the test exercises the dataset-iteration → evaluator-call wiring without depending on any model weights.

Classes

Functions

fastvideo.tests.eval.test_evaluator_with_dataset.test_dataset_samples_form_through_evaluator
test_dataset_samples_form_through_evaluator()

Evaluator.evaluate(samples=[...]) is the canonical batched entry point; verify it works when the per-row dicts come from a dataset (kwargs form) rather than being hand-built in the test.

Source code in fastvideo/tests/eval/test_evaluator_with_dataset.py
def test_dataset_samples_form_through_evaluator():
    """Dataset rows can be merged into ``samples=[...]`` entries.

    The first three rows of the ``vbench`` dataset (``color`` dimension)
    are each combined with a synthetic video and reference tensor, then
    evaluated in one batched call on CPU. Every returned row must contain
    a ``common.psnr`` entry with a non-``None`` score.
    """
    dataset = get_dataset("vbench", dimensions=["color"])
    rows = [dataset[idx] for idx in range(3)]

    # Dataset fields are spread last, exactly as a caller would pass them.
    samples = [
        {
            "video": _synth_video(seed=idx),
            "reference": _synth_video(seed=100 + idx),
            **row,
        }
        for idx, row in enumerate(rows)
    ]

    cpu_evaluator = create_evaluator(metrics=["common.psnr"], device="cpu")
    try:
        results = cpu_evaluator.evaluate(samples=samples)
    finally:
        cpu_evaluator.shutdown()

    assert len(results) == len(rows)
    for result in results:
        assert "common.psnr" in result
        assert result["common.psnr"].score is not None
fastvideo.tests.eval.test_evaluator_with_dataset.test_vbench_dataset_full_corpus_iteration
test_vbench_dataset_full_corpus_iteration()

Iterating the whole dataset should be cheap (no evaluator calls). This guards against a future refactor that accidentally makes __iter__ do real work.

Source code in fastvideo/tests/eval/test_evaluator_with_dataset.py
def test_vbench_dataset_full_corpus_iteration():
    """Iterating the full ``vbench`` dataset yields ``len(ds)`` dict rows.

    No evaluator is created; the test only materializes the dataset and
    checks the row count and row type.
    """
    corpus = get_dataset("vbench")
    materialized = list(corpus)
    assert len(materialized) == len(corpus)
    assert all(isinstance(entry, dict) for entry in materialized)
fastvideo.tests.eval.test_evaluator_with_dataset.test_vbench_dataset_rows_drop_into_evaluator
test_vbench_dataset_rows_drop_into_evaluator()

Every row from the corpus must be a kwargs-friendly dict for Evaluator.evaluate: extra keys flow through without breaking the metric, and the metric returns a well-formed MetricResult.

Source code in fastvideo/tests/eval/test_evaluator_with_dataset.py
def test_vbench_dataset_rows_drop_into_evaluator():
    """Dataset rows can be splatted straight into ``Evaluator.evaluate``.

    The first three ``vbench`` rows (``color`` dimension) must each carry
    ``prompt`` and ``auxiliary_info`` keys. Each row is then passed as
    extra keyword arguments next to synthetic video and reference tensors,
    and the ``common.psnr`` result must be a ``MetricResult`` with the
    right name and a non-``None`` score.
    """
    dataset = get_dataset("vbench", dimensions=["color"])
    rows = [dataset[idx] for idx in range(3)]
    for required_key in ("prompt", "auxiliary_info"):
        assert all(required_key in row for row in rows)

    cpu_evaluator = create_evaluator(metrics=["common.psnr"], device="cpu")
    try:
        for idx, row in enumerate(rows):
            # The row is forwarded verbatim. The contract being pinned is
            # that the call still succeeds with the extra dataset fields
            # present alongside the video and reference tensors.
            scores = cpu_evaluator.evaluate(
                video=_synth_video(seed=idx),
                reference=_synth_video(seed=100 + idx),
                **row,
            )
            result = scores["common.psnr"]
            assert isinstance(result, MetricResult)
            assert result.name == "common.psnr"
            assert result.score is not None
    finally:
        cpu_evaluator.shutdown()

fastvideo.tests.eval.test_registry

Smoke tests for the metric registry surface.

These exercise the public fastvideo.eval API only: list_metrics, get_metric, and the group-resolution logic that create_evaluator(metrics="vbench") uses.

Functions

fastvideo.tests.eval.test_registry.test_create_evaluator_resolves_group_prefix
test_create_evaluator_resolves_group_prefix()

metrics="<group>" should expand to every <group>.* sub-metric.

Use the physics_iq group because it has multiple sub-metrics and none of them load model weights — the group-resolution behavior is what we're testing, not metric setup.

Source code in fastvideo/tests/eval/test_registry.py
def test_create_evaluator_resolves_group_prefix():
    """A bare group name expands to that group's ``<group>.*`` metrics.

    ``metrics="physics_iq"`` is used as the example group. Every resolved
    name must start with ``physics_iq.``, and three known sub-metrics must
    be present. The total count is deliberately not pinned so that new
    sub-metrics can be added without breaking this test.
    """
    group_evaluator = create_evaluator(metrics="physics_iq", device="cpu")
    try:
        resolved = group_evaluator.metric_names
        assert all(name.startswith("physics_iq.") for name in resolved)
        for expected_name in (
            "physics_iq.spatial_iou",
            "physics_iq.spatiotemporal_iou",
            "physics_iq.mse",
        ):
            assert expected_name in resolved
    finally:
        group_evaluator.shutdown()