@pytest.mark.parametrize(
    "cfg",
    _BENCHMARK_CONFIGS,
    ids=[c["benchmark_id"] for c in _BENCHMARK_CONFIGS],
)
def test_inference_performance(cfg):
    """Benchmark one model configuration end to end.

    Runs warmup passes followed by timed generation passes, records
    per-run latency and peak GPU memory, writes a results record, and
    asserts the averages stay under the device-aware thresholds.
    """
    runtime_cfg = cfg.get("run_config", {})
    gpus_needed = runtime_cfg.get("required_gpus", 1)
    gpus_present = torch.cuda.device_count()
    if gpus_present < gpus_needed:
        pytest.skip(f"Need {gpus_needed} GPUs, only {gpus_present} available")

    model_info = cfg["model"]
    init_kwargs = dict(cfg.get("init_kwargs", {}))
    gen_kwargs = dict(cfg.get("generation_kwargs", {}))
    prompt = cfg.get("test_prompts", ["A cinematic video."])[0]
    warmup_runs = runtime_cfg.get("num_warmup_runs", 1)
    measure_runs = runtime_cfg.get("num_measurement_runs", 3)
    thresholds = _get_thresholds(cfg)

    # JSON carries the precisions as a list; the generator expects a tuple.
    precisions = init_kwargs.pop("text_encoder_precisions", None)
    if precisions is not None:
        init_kwargs["text_encoder_precisions"] = tuple(precisions)

    # Generated videos land next to this file, one directory per benchmark.
    output_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "generated_videos",
        cfg["benchmark_id"],
    )
    os.makedirs(output_dir, exist_ok=True)
    gen_kwargs["output_path"] = output_dir

    times = []
    peak_memories = []
    generator = None
    try:
        generator = VideoGenerator.from_pretrained(
            model_path=model_info["model_path"],
            **init_kwargs,
        )
        for run in range(warmup_runs):
            logger.info("Warmup run %d/%d", run + 1, warmup_runs)
            _run_generation(generator, prompt, gen_kwargs)
        for run in range(measure_runs):
            logger.info("Measurement run %d/%d", run + 1, measure_runs)
            elapsed, peak_mb = _run_generation(generator, prompt, gen_kwargs)
            logger.info(" Time: %.2fs, Peak memory: %.0fMB", elapsed, peak_mb)
            times.append(elapsed)
            peak_memories.append(peak_mb)
    finally:
        # Always tear down the generator, even if construction or a run failed
        # (the helper is expected to tolerate a None generator).
        _shutdown_executor(generator)

    avg_time = sum(times) / len(times)
    max_peak_memory = max(peak_memories)
    device_name = torch.cuda.get_device_name()
    results = {
        "benchmark_id": cfg["benchmark_id"],
        "model_short_name": model_info.get("model_short_name", ""),
        "device": device_name,
        "num_gpus": init_kwargs.get("num_gpus", 1),
        "num_warmup_runs": warmup_runs,
        "num_measurement_runs": measure_runs,
        "avg_generation_time_s": round(avg_time, 3),
        "individual_times_s": [round(t, 3) for t in times],
        "max_peak_memory_mb": round(max_peak_memory, 1),
        "individual_peak_memories_mb": [round(m, 1) for m in peak_memories],
        "thresholds": thresholds,
        "commit": os.environ.get("BUILDKITE_COMMIT", ""),
        "pr_number": os.environ.get("BUILDKITE_PULL_REQUEST", ""),
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    logger.info(
        "Performance results: avg_time=%.2fs, "
        "max_peak_memory=%.0fMB", avg_time, max_peak_memory)
    _write_results(results)

    max_time = thresholds["max_generation_time_s"]
    max_mem = thresholds["max_peak_memory_mb"]
    assert avg_time <= max_time, (
        f"Average generation time {avg_time:.2f}s exceeds "
        f"threshold {max_time:.1f}s for {device_name}")
    assert max_peak_memory <= max_mem, (
        f"Peak memory {max_peak_memory:.0f}MB exceeds "
        f"threshold {max_mem:.0f}MB for {device_name}")