Skip to content

stages

Classes

fastvideo.pipelines.basic.magi_human.stages.MagiHumanAudioDecodingStage

MagiHumanAudioDecodingStage(audio_vae, time_stretching: float = _UPSTREAM_AUDIO_TIME_STRETCH)

Bases: PipelineStage

Decode batch.audio_latents to a waveform using Stable Audio's VAE.

The VAE is loaded lazily by SAAudioVAEModel.sa_audio_vae_model — the first call triggers a snapshot_download (requires HF token + accepted terms on stabilityai/stable-audio-open-1.0).

Source code in fastvideo/pipelines/basic/magi_human/stages/audio_decoding.py
def __init__(
    self,
    audio_vae,
    time_stretching: float = _UPSTREAM_AUDIO_TIME_STRETCH,
) -> None:
    """Set up the audio decoding stage.

    Args:
        audio_vae: Audio VAE object, kept as ``self.audio_vae``.
        time_stretching: Time-stretch factor, kept as
            ``self.time_stretching``. Defaults to
            ``_UPSTREAM_AUDIO_TIME_STRETCH``.
    """
    super().__init__()
    # Both arguments are stored untouched, in declaration order.
    self.audio_vae, self.time_stretching = audio_vae, time_stretching

fastvideo.pipelines.basic.magi_human.stages.MagiHumanDenoisingStage

MagiHumanDenoisingStage(transformer, scheduler, patch_size: tuple[int, int, int] = (1, 2, 2), video_in_channels: int = 192, audio_in_channels: int = 64, video_txt_guidance_scale: float = 5.0, audio_txt_guidance_scale: float = 5.0, cfg_number: int = 2, coords_style: str = 'v2', video_guidance_high_t_threshold: int = 500, video_guidance_low_t_value: float = 2.0)

Bases: PipelineStage

UniPC-flow joint denoising with CFG=2 over (video, audio) latents.

Source code in fastvideo/pipelines/basic/magi_human/stages/denoising.py
def __init__(
    self,
    transformer,
    scheduler,
    patch_size: tuple[int, int, int] = (1, 2, 2),
    video_in_channels: int = 192,
    audio_in_channels: int = 64,
    video_txt_guidance_scale: float = 5.0,
    audio_txt_guidance_scale: float = 5.0,
    cfg_number: int = 2,
    coords_style: str = "v2",
    video_guidance_high_t_threshold: int = 500,
    video_guidance_low_t_value: float = 2.0,
) -> None:
    """Record the model handles and guidance settings for joint denoising.

    No argument is validated or transformed here; each one is stored
    verbatim on the stage under the attribute of the same name.

    Args:
        transformer: Model object, kept as ``self.transformer``.
        scheduler: Scheduler object, kept as ``self.scheduler``.
        patch_size: Three-integer patch size tuple.
        video_in_channels: Video channel count setting.
        audio_in_channels: Audio channel count setting.
        video_txt_guidance_scale: Text guidance scale for video.
        audio_txt_guidance_scale: Text guidance scale for audio.
        cfg_number: CFG setting (2 by default).
        coords_style: Coordinate style tag.
        video_guidance_high_t_threshold: Timestep threshold setting.
        video_guidance_low_t_value: Guidance value setting.
    """
    super().__init__()

    # Model handles.
    self.transformer, self.scheduler = transformer, scheduler

    # Token geometry.
    self.patch_size = patch_size
    self.video_in_channels, self.audio_in_channels = video_in_channels, audio_in_channels

    # Guidance configuration.
    self.video_txt_guidance_scale, self.audio_txt_guidance_scale = (
        video_txt_guidance_scale,
        audio_txt_guidance_scale,
    )
    self.cfg_number = cfg_number
    # NOTE(review): annotated as plain ``str`` here; confirm the accepted
    # values are the same "v1"/"v2" pair used by the packing helpers.
    self.coords_style = coords_style
    self.video_guidance_high_t_threshold, self.video_guidance_low_t_value = (
        video_guidance_high_t_threshold,
        video_guidance_low_t_value,
    )

fastvideo.pipelines.basic.magi_human.stages.MagiHumanLatentPreparationStage

MagiHumanLatentPreparationStage(vae_stride: tuple[int, int, int] = (4, 16, 16), z_dim: int = 48, patch_size: tuple[int, int, int] = (1, 2, 2), fps: int = 25, t5_gemma_target_length: int = 640, coords_style: Literal['v1', 'v2'] = 'v2', text_offset: int = 0, audio_in_channels: int = 64)

Bases: PipelineStage

Prepare latents, coords, modality maps, and padded text embed.

Source code in fastvideo/pipelines/basic/magi_human/stages/latent_preparation.py
def __init__(
    self,
    vae_stride: tuple[int, int, int] = (4, 16, 16),
    z_dim: int = 48,
    patch_size: tuple[int, int, int] = (1, 2, 2),
    fps: int = 25,
    t5_gemma_target_length: int = 640,
    coords_style: Literal["v1", "v2"] = "v2",
    text_offset: int = 0,
    audio_in_channels: int = 64,
) -> None:
    """Record the configuration used when preparing latents.

    Each argument is stored verbatim on the stage under the attribute of
    the same name; nothing is validated or derived here.

    Args:
        vae_stride: Three-integer VAE stride tuple.
        z_dim: Latent channel count setting.
        patch_size: Three-integer patch size tuple.
        fps: Frames-per-second setting.
        t5_gemma_target_length: Target length for the padded text embedding.
        coords_style: Coordinate style, ``"v1"`` or ``"v2"``.
        text_offset: Text offset setting.
        audio_in_channels: Audio channel count setting.
    """
    super().__init__()

    # Latent / token geometry.
    self.vae_stride, self.z_dim = vae_stride, z_dim
    self.patch_size, self.fps = patch_size, fps

    # Text and coordinate handling.
    self.t5_gemma_target_length = t5_gemma_target_length
    self.coords_style, self.text_offset = coords_style, text_offset

    self.audio_in_channels = audio_in_channels

fastvideo.pipelines.basic.magi_human.stages.MagiHumanReferenceImageStage

MagiHumanReferenceImageStage(vae: Any, vae_scale_factor: int = 16)

Bases: PipelineStage

Encode a TI2V reference image into the first-frame video latent.

Source code in fastvideo/pipelines/basic/magi_human/stages/reference_image.py
def __init__(self, vae: Any, vae_scale_factor: int = 16) -> None:
    """Keep the VAE and build the video processor for this stage.

    Args:
        vae: VAE object, kept as ``self.vae``.
        vae_scale_factor: Forwarded to ``VideoProcessor`` as its
            ``vae_scale_factor`` argument.
    """
    super().__init__()
    self.vae = vae
    processor = VideoProcessor(vae_scale_factor=vae_scale_factor)
    self.video_processor = processor

fastvideo.pipelines.basic.magi_human.stages.MagiHumanSRDenoisingStage

MagiHumanSRDenoisingStage(transformer, scheduler, patch_size: tuple[int, int, int] = (1, 2, 2), video_in_channels: int = 192, audio_in_channels: int = 64, sr_num_inference_steps: int = 5, sr_video_txt_guidance_scale: float = 3.5, use_cfg_trick: bool = True, cfg_trick_start_frame: int = 13, cfg_trick_value: float = 2.0, cfg_number: int = 2, coords_style: str = 'v1')

Bases: PipelineStage

Denoise only the SR video latent; audio passes through unchanged.

Source code in fastvideo/pipelines/basic/magi_human/stages/sr_denoising.py
def __init__(
    self,
    transformer,
    scheduler,
    patch_size: tuple[int, int, int] = (1, 2, 2),
    video_in_channels: int = 192,
    audio_in_channels: int = 64,
    sr_num_inference_steps: int = 5,
    sr_video_txt_guidance_scale: float = 3.5,
    use_cfg_trick: bool = True,
    cfg_trick_start_frame: int = 13,
    cfg_trick_value: float = 2.0,
    cfg_number: int = 2,
    coords_style: str = "v1",
) -> None:
    """Record the model handles and settings for SR denoising.

    Each argument is stored verbatim on the stage under the attribute of
    the same name; nothing is validated or derived here.

    Args:
        transformer: Model object, kept as ``self.transformer``.
        scheduler: Scheduler object, kept as ``self.scheduler``.
        patch_size: Three-integer patch size tuple.
        video_in_channels: Video channel count setting.
        audio_in_channels: Audio channel count setting.
        sr_num_inference_steps: Number of SR inference steps.
        sr_video_txt_guidance_scale: Text guidance scale for SR video.
        use_cfg_trick: Flag for the CFG trick.
        cfg_trick_start_frame: Start-frame setting for the CFG trick.
        cfg_trick_value: Value setting for the CFG trick.
        cfg_number: CFG setting (2 by default).
        coords_style: Coordinate style tag (``"v1"`` by default).
    """
    super().__init__()

    # Model handles.
    self.transformer, self.scheduler = transformer, scheduler

    # Token geometry.
    self.patch_size = patch_size
    self.video_in_channels, self.audio_in_channels = video_in_channels, audio_in_channels

    # Sampling and guidance configuration.
    self.sr_num_inference_steps = sr_num_inference_steps
    self.sr_video_txt_guidance_scale = sr_video_txt_guidance_scale
    self.use_cfg_trick = use_cfg_trick
    self.cfg_trick_start_frame, self.cfg_trick_value = cfg_trick_start_frame, cfg_trick_value
    self.cfg_number, self.coords_style = cfg_number, coords_style

fastvideo.pipelines.basic.magi_human.stages.MagiHumanSRLatentPreparationStage

MagiHumanSRLatentPreparationStage(vae: Any, vae_stride: tuple[int, int, int] = (4, 16, 16), patch_size: tuple[int, int, int] = (1, 2, 2), noise_value: int = 220, sr_audio_noise_scale: float = 0.7, sr_height: int = 512, sr_width: int = 896, vae_scale_factor: int = 16)

Bases: PipelineStage

Upsample base latents, add SR noise, and refresh SR conditioning.

Source code in fastvideo/pipelines/basic/magi_human/stages/sr_latent_preparation.py
def __init__(
    self,
    vae: Any,
    vae_stride: tuple[int, int, int] = (4, 16, 16),
    patch_size: tuple[int, int, int] = (1, 2, 2),
    noise_value: int = 220,
    sr_audio_noise_scale: float = 0.7,
    sr_height: int = 512,
    sr_width: int = 896,
    vae_scale_factor: int = 16,
) -> None:
    """Record SR settings and build the noise schedule and video processor.

    Args:
        vae: VAE object, kept as ``self.vae``.
        vae_stride: Three-integer VAE stride tuple.
        patch_size: Three-integer patch size tuple.
        noise_value: Noise setting, stored as-is.
        sr_audio_noise_scale: Audio noise scale setting.
        sr_height: SR height setting.
        sr_width: SR width setting.
        vae_scale_factor: Forwarded to ``VideoProcessor``.
    """
    super().__init__()

    self.vae = vae
    self.vae_stride, self.patch_size = vae_stride, patch_size
    self.noise_value, self.sr_audio_noise_scale = noise_value, sr_audio_noise_scale
    self.sr_height, self.sr_width = sr_height, sr_width

    # Default-configured discretization, evaluated once for 1000 steps.
    discretization = ZeroSNRDDPMDiscretization()
    self.sigmas = discretization(1000, do_append_zero=False, flip=True)

    self.video_processor = VideoProcessor(vae_scale_factor=vae_scale_factor)

Modules

fastvideo.pipelines.basic.magi_human.stages.audio_decoding

Audio decoding stage for daVinci-MagiHuman.

Takes the denoised audio latent that MagiHumanDenoisingStage leaves on batch.audio_latents and decodes it to a waveform using the Stable Audio Open 1.0 VAE. Mirrors the upstream post-process path (see MagiEvaluator.post_process in daVinci-MagiHuman/inference/pipeline/video_generate.py:503):

latent_audio.squeeze(0)                 # (L, C_latent)
audio = self.audio_vae.decode(latent_audio.T)   # (1, audio_ch, samples)
audio = audio.squeeze(0).T.cpu().numpy()        # (samples, audio_ch)
audio = resample_audio_sinc(audio, _UPSTREAM_AUDIO_TIME_STRETCH)

The stage stores the resampled waveform on batch.extra["audio"] (shape [samples, audio_channels]) and the sample rate on batch.extra["audio_sample_rate"]. FastVideo's VideoGenerator._mux_audio then reads those, writes a temp wav, and muxes it into the output mp4 via PyAV — same plumbing LTX-2 and Stable Audio use.

Classes

fastvideo.pipelines.basic.magi_human.stages.audio_decoding.MagiHumanAudioDecodingStage
MagiHumanAudioDecodingStage(audio_vae, time_stretching: float = _UPSTREAM_AUDIO_TIME_STRETCH)

Bases: PipelineStage

Decode batch.audio_latents to a waveform using Stable Audio's VAE.

The VAE is loaded lazily by SAAudioVAEModel.sa_audio_vae_model — the first call triggers a snapshot_download (requires HF token + accepted terms on stabilityai/stable-audio-open-1.0).

Source code in fastvideo/pipelines/basic/magi_human/stages/audio_decoding.py
def __init__(
    self,
    audio_vae,
    time_stretching: float = _UPSTREAM_AUDIO_TIME_STRETCH,
) -> None:
    """Set up the audio decoding stage.

    Args:
        audio_vae: Audio VAE object, kept as ``self.audio_vae``.
        time_stretching: Time-stretch factor, kept as
            ``self.time_stretching``. Defaults to
            ``_UPSTREAM_AUDIO_TIME_STRETCH``.
    """
    super().__init__()
    # Both arguments are stored untouched, in declaration order.
    self.audio_vae, self.time_stretching = audio_vae, time_stretching

fastvideo.pipelines.basic.magi_human.stages.denoising

Joint-modality denoising stage for daVinci-MagiHuman base text-to-AV.

Runs the FlowUniPC denoise loop with CFG=2 over video + audio latents jointly. Text embeddings are already pad-or-trimmed to t5_gemma_target_length by MagiHumanLatentPreparationStage; the original context lengths are stashed on the batch as magi_original_text_lens / magi_original_neg_text_lens.

Classes

fastvideo.pipelines.basic.magi_human.stages.denoising.MagiHumanDenoisingStage
MagiHumanDenoisingStage(transformer, scheduler, patch_size: tuple[int, int, int] = (1, 2, 2), video_in_channels: int = 192, audio_in_channels: int = 64, video_txt_guidance_scale: float = 5.0, audio_txt_guidance_scale: float = 5.0, cfg_number: int = 2, coords_style: str = 'v2', video_guidance_high_t_threshold: int = 500, video_guidance_low_t_value: float = 2.0)

Bases: PipelineStage

UniPC-flow joint denoising with CFG=2 over (video, audio) latents.

Source code in fastvideo/pipelines/basic/magi_human/stages/denoising.py
def __init__(
    self,
    transformer,
    scheduler,
    patch_size: tuple[int, int, int] = (1, 2, 2),
    video_in_channels: int = 192,
    audio_in_channels: int = 64,
    video_txt_guidance_scale: float = 5.0,
    audio_txt_guidance_scale: float = 5.0,
    cfg_number: int = 2,
    coords_style: str = "v2",
    video_guidance_high_t_threshold: int = 500,
    video_guidance_low_t_value: float = 2.0,
) -> None:
    """Record the model handles and guidance settings for joint denoising.

    No argument is validated or transformed here; each one is stored
    verbatim on the stage under the attribute of the same name.

    Args:
        transformer: Model object, kept as ``self.transformer``.
        scheduler: Scheduler object, kept as ``self.scheduler``.
        patch_size: Three-integer patch size tuple.
        video_in_channels: Video channel count setting.
        audio_in_channels: Audio channel count setting.
        video_txt_guidance_scale: Text guidance scale for video.
        audio_txt_guidance_scale: Text guidance scale for audio.
        cfg_number: CFG setting (2 by default).
        coords_style: Coordinate style tag.
        video_guidance_high_t_threshold: Timestep threshold setting.
        video_guidance_low_t_value: Guidance value setting.
    """
    super().__init__()

    # Model handles.
    self.transformer, self.scheduler = transformer, scheduler

    # Token geometry.
    self.patch_size = patch_size
    self.video_in_channels, self.audio_in_channels = video_in_channels, audio_in_channels

    # Guidance configuration.
    self.video_txt_guidance_scale, self.audio_txt_guidance_scale = (
        video_txt_guidance_scale,
        audio_txt_guidance_scale,
    )
    self.cfg_number = cfg_number
    # NOTE(review): annotated as plain ``str`` here; confirm the accepted
    # values are the same "v1"/"v2" pair used by the packing helpers.
    self.coords_style = coords_style
    self.video_guidance_high_t_threshold, self.video_guidance_low_t_value = (
        video_guidance_high_t_threshold,
        video_guidance_low_t_value,
    )

Functions

fastvideo.pipelines.basic.magi_human.stages.latent_preparation

Latent preparation stage for daVinci-MagiHuman base text-to-AV.

Produces
  • random video latent of shape [1, z_dim, latent_T, latent_H, latent_W],
  • random audio latent of shape [1, num_frames, 64] (the DiT jointly denoises both modalities),
  • padded T5-Gemma text embedding (target length 640) plus the original (pre-pad) context length, which the UniPC + CFG loop needs so the unconditional path sees the same padded length.

Also builds the per-token coords / modality map that the DiT consumes (replicating the reference MagiDataProxy.process_input).

Classes

fastvideo.pipelines.basic.magi_human.stages.latent_preparation.MagiHumanLatentPreparationStage
MagiHumanLatentPreparationStage(vae_stride: tuple[int, int, int] = (4, 16, 16), z_dim: int = 48, patch_size: tuple[int, int, int] = (1, 2, 2), fps: int = 25, t5_gemma_target_length: int = 640, coords_style: Literal['v1', 'v2'] = 'v2', text_offset: int = 0, audio_in_channels: int = 64)

Bases: PipelineStage

Prepare latents, coords, modality maps, and padded text embed.

Source code in fastvideo/pipelines/basic/magi_human/stages/latent_preparation.py
def __init__(
    self,
    vae_stride: tuple[int, int, int] = (4, 16, 16),
    z_dim: int = 48,
    patch_size: tuple[int, int, int] = (1, 2, 2),
    fps: int = 25,
    t5_gemma_target_length: int = 640,
    coords_style: Literal["v1", "v2"] = "v2",
    text_offset: int = 0,
    audio_in_channels: int = 64,
) -> None:
    """Record the configuration used when preparing latents.

    Each argument is stored verbatim on the stage under the attribute of
    the same name; nothing is validated or derived here.

    Args:
        vae_stride: Three-integer VAE stride tuple.
        z_dim: Latent channel count setting.
        patch_size: Three-integer patch size tuple.
        fps: Frames-per-second setting.
        t5_gemma_target_length: Target length for the padded text embedding.
        coords_style: Coordinate style, ``"v1"`` or ``"v2"``.
        text_offset: Text offset setting.
        audio_in_channels: Audio channel count setting.
    """
    super().__init__()

    # Latent / token geometry.
    self.vae_stride, self.z_dim = vae_stride, z_dim
    self.patch_size, self.fps = patch_size, fps

    # Text and coordinate handling.
    self.t5_gemma_target_length = t5_gemma_target_length
    self.coords_style, self.text_offset = coords_style, text_offset

    self.audio_in_channels = audio_in_channels
fastvideo.pipelines.basic.magi_human.stages.latent_preparation.StaticPackedInputs
StaticPackedInputs(video_tokens: Tensor, audio_tokens: Tensor, video_coords: Tensor, audio_coords: Tensor, video_mm: Tensor, audio_mm: Tensor, max_ch: int)

Step-invariant packed inputs: video+audio tokens, coords, modality map.

Computed once before the denoise loop; reused for every cond/uncond call. Text tokens are NOT included here because cond/uncond have different lengths.

Source code in fastvideo/pipelines/basic/magi_human/stages/latent_preparation.py
def __init__(
    self,
    video_tokens: torch.Tensor,
    audio_tokens: torch.Tensor,
    video_coords: torch.Tensor,
    audio_coords: torch.Tensor,
    video_mm: torch.Tensor,
    audio_mm: torch.Tensor,
    max_ch: int,
) -> None:
    """Bundle the packed video/audio tokens with their coords and modality maps.

    Args:
        video_tokens: Packed video tokens; the size of dim 0 becomes
            ``video_token_num``.
        audio_tokens: Packed audio tokens; the size of dim 0 becomes
            ``audio_feat_len``.
        video_coords: Coordinates for the video tokens.
        audio_coords: Coordinates for the audio tokens.
        video_mm: Modality map entries for the video tokens.
        audio_mm: Modality map entries for the audio tokens.
        max_ch: Common channel width of the packed tokens.
    """
    # Stored by reference, one modality pair at a time.
    self.video_tokens, self.audio_tokens = video_tokens, audio_tokens
    self.video_coords, self.audio_coords = video_coords, audio_coords
    self.video_mm, self.audio_mm = video_mm, audio_mm

    # Token counts are read off the leading dimension of each token tensor.
    self.video_token_num, self.audio_feat_len = video_tokens.size(0), audio_tokens.size(0)
    self.max_ch = max_ch
fastvideo.pipelines.basic.magi_human.stages.latent_preparation.StaticPackedLayout
StaticPackedLayout(video_coords: Tensor, audio_coords: Tensor, video_mm: Tensor, audio_mm: Tensor, max_ch: int, video_token_num: int, audio_feat_len: int)

Step- and value-invariant portion of the static packed inputs.

Coords, modality maps, and the channel-padding width depend only on the latent shape, audio length, channel widths, and patch sizes — all fixed for a single generation. Precompute once before the denoise loop and reuse on every step. Only the per-step token tensors must be rebuilt.

Source code in fastvideo/pipelines/basic/magi_human/stages/latent_preparation.py
def __init__(
    self,
    video_coords: torch.Tensor,
    audio_coords: torch.Tensor,
    video_mm: torch.Tensor,
    audio_mm: torch.Tensor,
    max_ch: int,
    video_token_num: int,
    audio_feat_len: int,
) -> None:
    """Hold the precomputed, token-value-independent layout fields.

    Every argument is stored by reference under the attribute of the same
    name; nothing is copied or validated.

    Args:
        video_coords: Coordinates for the video tokens.
        audio_coords: Coordinates for the audio tokens.
        video_mm: Modality map entries for the video tokens.
        audio_mm: Modality map entries for the audio tokens.
        max_ch: Channel-padding width.
        video_token_num: Number of video tokens.
        audio_feat_len: Number of audio tokens.
    """
    self.video_coords, self.audio_coords = video_coords, audio_coords
    self.video_mm, self.audio_mm = video_mm, audio_mm
    self.max_ch = max_ch
    self.video_token_num, self.audio_feat_len = video_token_num, audio_feat_len

Functions

fastvideo.pipelines.basic.magi_human.stages.latent_preparation.assemble_packed_inputs
assemble_packed_inputs(static: StaticPackedInputs, txt_feat: Tensor, txt_feat_len: int, coords_style: Literal['v1', 'v2'] = 'v2', text_offset: int = 0) -> tuple[Tensor, Tensor, Tensor]

Attach per-call text tokens to the precomputed static packed inputs.

Returns (token_seq, coords, modality_map) ready for the DiT.

Source code in fastvideo/pipelines/basic/magi_human/stages/latent_preparation.py
def assemble_packed_inputs(
    static: StaticPackedInputs,
    txt_feat: torch.Tensor,
    txt_feat_len: int,
    coords_style: Literal["v1", "v2"] = "v2",
    text_offset: int = 0,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Attach per-call text tokens to the precomputed static packed inputs.

    The first ``txt_feat_len`` rows of ``txt_feat[0]`` are taken as the text
    tokens. Video, audio and text tokens are padded on the last dimension to
    a common width and concatenated in that order; the modality map and the
    coords are concatenated in the same order.

    Args:
        static: Precomputed video/audio tokens, coords and modality maps.
        txt_feat: Text features; only batch entry 0 is used.
        txt_feat_len: Number of leading text rows to keep. Must lie in
            ``[0, txt_feat.size(1)]``.
        coords_style: ``"v2"`` builds text coords with a ``(1, 1, 1)``
            reference shape and a temporal offset of ``-txt_feat_len``; any
            other value takes the v1 branch, which uses
            ``_V1_TEXT_REF_SHAPE`` and ``text_offset``.
        text_offset: Temporal offset for the text coords (v1 branch only).

    Returns:
        ``(token_seq, coords, modality_map)`` ready for the DiT.

    Raises:
        ValueError: If ``txt_feat_len`` is negative or larger than the number
            of text rows available in ``txt_feat``.
    """
    # Slicing past the end would silently yield fewer text tokens than the
    # ``txt_feat_len`` entries created below for the modality map and coords,
    # leaving the three returned tensors with different lengths.
    available = txt_feat.size(1)
    if not 0 <= txt_feat_len <= available:
        raise ValueError(f"txt_feat_len={txt_feat_len} is outside the valid range [0, {available}] "
                         "for the given txt_feat")

    text_tokens = txt_feat[0, :txt_feat_len].contiguous()
    # Text may be wider than the video/audio channel width, so re-derive it.
    max_ch = max(static.max_ch, text_tokens.size(-1))

    video_tokens = F.pad(static.video_tokens, (0, max_ch - static.video_tokens.size(-1)))
    audio_tokens = F.pad(static.audio_tokens, (0, max_ch - static.audio_tokens.size(-1)))
    text_tokens = F.pad(text_tokens, (0, max_ch - text_tokens.size(-1)))
    token_seq = torch.cat([video_tokens, audio_tokens, text_tokens], dim=0)

    device = token_seq.device
    dtype = token_seq.dtype
    text_mm = torch.full((txt_feat_len, ), MODALITY_TEXT, dtype=torch.int64, device=device)
    mm = torch.cat([static.video_mm, static.audio_mm, text_mm], dim=0)

    if coords_style == "v2":
        text_coords = _build_coords(
            shape=(txt_feat_len, 1, 1),
            ref_feat_shape=(1, 1, 1),
            offset_thw=(-txt_feat_len, 0, 0),
            device=device,
            dtype=dtype,
        )
    else:
        text_coords = _build_coords(
            shape=(txt_feat_len, 1, 1),
            ref_feat_shape=_V1_TEXT_REF_SHAPE,
            offset_thw=(text_offset, 0, 0),
            device=device,
            dtype=dtype,
        )

    coords = torch.cat([static.video_coords, static.audio_coords, text_coords], dim=0)
    return token_seq, coords, mm
fastvideo.pipelines.basic.magi_human.stages.latent_preparation.build_packed_inputs
build_packed_inputs(video_latent: Tensor, audio_latent: Tensor, audio_feat_len: int, txt_feat: Tensor, txt_feat_len: int, patch_size: tuple[int, int, int], coords_style: Literal['v1', 'v2'] = 'v2', text_offset: int = 0) -> tuple[Tensor, Tensor, Tensor]

Build the full packed token stream in one call (backwards-compat wrapper).

Equivalent to assemble_packed_inputs(build_static_packed_inputs(...), ...). Prefer calling the two helpers separately when the static portion can be reused across multiple calls (e.g. cond/uncond in the denoise loop).

Source code in fastvideo/pipelines/basic/magi_human/stages/latent_preparation.py
def build_packed_inputs(
    video_latent: torch.Tensor,
    audio_latent: torch.Tensor,
    audio_feat_len: int,
    txt_feat: torch.Tensor,
    txt_feat_len: int,
    patch_size: tuple[int, int, int],
    coords_style: Literal["v1", "v2"] = "v2",
    text_offset: int = 0,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """One-shot convenience wrapper that packs video, audio and text together.

    Runs ``build_static_packed_inputs`` (without a precomputed layout) and
    feeds its result straight into ``assemble_packed_inputs``. When the
    video/audio portion can be shared between several calls (e.g. cond and
    uncond in the denoise loop), call the two helpers directly instead.

    Returns:
        The ``(token_seq, coords, modality_map)`` tuple produced by
        ``assemble_packed_inputs``.
    """
    return assemble_packed_inputs(
        static=build_static_packed_inputs(
            video_latent=video_latent,
            audio_latent=audio_latent,
            audio_feat_len=audio_feat_len,
            patch_size=patch_size,
            coords_style=coords_style,
        ),
        txt_feat=txt_feat,
        txt_feat_len=txt_feat_len,
        coords_style=coords_style,
        text_offset=text_offset,
    )
fastvideo.pipelines.basic.magi_human.stages.latent_preparation.build_static_packed_inputs
build_static_packed_inputs(video_latent: Tensor, audio_latent: Tensor, audio_feat_len: int, patch_size: tuple[int, int, int], coords_style: Literal['v1', 'v2'] = 'v2', layout: StaticPackedLayout | None = None) -> StaticPackedInputs

Build the step-invariant portion of the packed token stream.

Returns video+audio tokens (padded to a common channel width), their coords, and their modality slices. Text is excluded because cond/uncond differ in length; call assemble_packed_inputs to attach text per call.

Mirrors SingleData.token_sequence / coords_mapping / modality_mapping in inference/pipeline/data_proxy.py, minus the text portion.

When layout is provided, coords / modality maps / max_ch are taken from the precomputed values and only the per-step token tensors are rebuilt; this is the hot-path call from the denoising loop. When layout is None the function recomputes everything from scratch (e.g. for one-shot tests via build_packed_inputs).

Source code in fastvideo/pipelines/basic/magi_human/stages/latent_preparation.py
def build_static_packed_inputs(
    video_latent: torch.Tensor,
    audio_latent: torch.Tensor,
    audio_feat_len: int,
    patch_size: tuple[int, int, int],
    coords_style: Literal["v1", "v2"] = "v2",
    layout: StaticPackedLayout | None = None,
) -> StaticPackedInputs:
    """Build the step-invariant portion of the packed token stream.

    Returns video+audio tokens (padded to a common channel width), their
    coords, and their modality slices. Text is excluded because cond/uncond
    differ in length; call assemble_packed_inputs to attach text per call.

    Mirrors SingleData.token_sequence / coords_mapping / modality_mapping in
    inference/pipeline/data_proxy.py, minus the text portion.

    When ``layout`` is provided, coords / modality maps / max_ch are taken
    from the precomputed values and only the per-step token tensors are
    rebuilt; this is the hot-path call from the denoising loop. When
    ``layout`` is None the function recomputes everything from scratch
    (e.g. for one-shot tests via ``build_packed_inputs``).

    Args:
        video_latent: Video latent with a leading batch dimension of 1; the
            slow path unpacks its shape as ``(_, _, T, H, W)``.
        audio_latent: Audio latent; the first ``audio_feat_len`` rows of
            batch entry 0 are used as audio tokens.
        audio_feat_len: Number of audio rows to keep.
        patch_size: ``(pT, pH, pW)`` patch sizes.
        coords_style: ``"v2"`` derives the audio reference length from
            ``audio_feat_len``; any other value uses ``T // pT``. Ignored
            when ``layout`` is given.
        layout: Optional precomputed coords / modality maps / ``max_ch``.

    Returns:
        A ``StaticPackedInputs`` carrying the padded tokens plus coords and
        modality maps.

    Raises:
        ValueError: If the batch dimension of ``video_latent`` is not 1.
    """
    pT, pH, pW = patch_size
    # Raised explicitly (rather than asserted) so the check still runs when
    # Python is started with -O, which strips assert statements.
    if video_latent.size(0) != 1:
        raise ValueError("batch size 1 required for MagiHuman base")

    # NOTE(review): only pH is forwarded as the spatial patch size, which
    # presumably assumes pH == pW — confirm against _img2tokens.
    video_tokens = _img2tokens(video_latent, t_patch=pT, patch=pH)[0]
    audio_tokens = audio_latent[0, :audio_feat_len].contiguous()

    # Both paths pad the last (channel) dimension of each modality up to a
    # shared width; only the source of that width differs.
    if layout is not None:
        max_ch = layout.max_ch
    else:
        max_ch = max(video_tokens.size(-1), audio_tokens.size(-1))
    video_tokens = F.pad(video_tokens, (0, max_ch - video_tokens.size(-1)))
    audio_tokens = F.pad(audio_tokens, (0, max_ch - audio_tokens.size(-1)))

    if layout is not None:
        # Hot path: reuse every precomputed invariant from the layout.
        return StaticPackedInputs(
            video_tokens=video_tokens,
            audio_tokens=audio_tokens,
            video_coords=layout.video_coords,
            audio_coords=layout.audio_coords,
            video_mm=layout.video_mm,
            audio_mm=layout.audio_mm,
            max_ch=max_ch,
        )

    # Slow path: rebuild every invariant from scratch. Kept for the
    # ``build_packed_inputs`` one-shot wrapper used by tests/parity helpers.
    _, _, T, H, W = video_latent.shape

    device = video_tokens.device
    dtype = video_tokens.dtype
    video_token_num = video_tokens.size(0)

    video_mm = torch.full((video_token_num, ), MODALITY_VIDEO, dtype=torch.int64, device=device)
    audio_mm = torch.full((audio_feat_len, ), MODALITY_AUDIO, dtype=torch.int64, device=device)

    video_ref_shape = (T // pT, H // pH, W // pW)
    video_coords = _build_coords(
        shape=video_ref_shape,
        ref_feat_shape=video_ref_shape,
        device=device,
        dtype=dtype,
    )

    if coords_style == "v2":
        audio_ref_t = (audio_feat_len - 1) // _AUDIO_TEMPORAL_COMPRESSION + 1
        audio_coords = _build_coords(
            shape=(audio_feat_len, 1, 1),
            ref_feat_shape=(audio_ref_t // pT, 1, 1),
            device=device,
            dtype=dtype,
        )
    else:
        audio_coords = _build_coords(
            shape=(audio_feat_len, 1, 1),
            ref_feat_shape=(T // pT, 1, 1),
            device=device,
            dtype=dtype,
        )

    return StaticPackedInputs(
        video_tokens=video_tokens,
        audio_tokens=audio_tokens,
        video_coords=video_coords,
        audio_coords=audio_coords,
        video_mm=video_mm,
        audio_mm=audio_mm,
        max_ch=max_ch,
    )
fastvideo.pipelines.basic.magi_human.stages.latent_preparation.precompute_static_packed_layout
precompute_static_packed_layout(latent_shape: tuple[int, int, int, int, int], audio_feat_len: int, z_dim: int, audio_in_channels: int, patch_size: tuple[int, int, int], coords_style: Literal['v1', 'v2'] = 'v2', device: device | None = None, dtype: dtype = float32) -> StaticPackedLayout

Precompute the invariant fields used by build_static_packed_inputs.

Arguments are derived from configs and latent shape — none depend on the current denoising-step values. Call this once in the latent preparation stage (or any pre-loop site) and pass the result via the layout= arg of build_static_packed_inputs to skip the meshgrid/full() work on every step.

Source code in fastvideo/pipelines/basic/magi_human/stages/latent_preparation.py
def precompute_static_packed_layout(
    latent_shape: tuple[int, int, int, int, int],
    audio_feat_len: int,
    z_dim: int,
    audio_in_channels: int,
    patch_size: tuple[int, int, int],
    coords_style: Literal["v1", "v2"] = "v2",
    device: torch.device | None = None,
    dtype: torch.dtype = torch.float32,
) -> StaticPackedLayout:
    """Build the layout fields that ``build_static_packed_inputs`` can reuse.

    Everything computed here depends only on shapes and config values, never
    on the latent contents, so it can be done once before the denoise loop
    and handed to ``build_static_packed_inputs`` through ``layout=`` to avoid
    redoing the coords / ``full()`` work on each step.

    Args:
        latent_shape: Five-element latent shape; the last three entries are
            read as ``(T, H, W)``.
        audio_feat_len: Number of audio tokens.
        z_dim: Latent channel count used for the packed video channel width.
        audio_in_channels: Audio token channel width.
        patch_size: ``(pT, pH, pW)`` patch sizes.
        coords_style: ``"v2"`` derives the audio reference length from
            ``audio_feat_len``; any other value uses ``T // pT``.
        device: Target device; CPU when ``None``.
        dtype: dtype passed to the coords builder.

    Returns:
        A ``StaticPackedLayout`` with coords, modality maps, ``max_ch`` and
        the two token counts.
    """
    pT, pH, pW = patch_size
    _, _, T, H, W = latent_shape
    target_device = torch.device("cpu") if device is None else device

    # Token grid after patching; its product is the video token count.
    grid_t, grid_h, grid_w = T // pT, H // pH, W // pW
    video_token_num = grid_t * grid_h * grid_w

    # `_img2tokens` packs to channel `z_dim * pT * pH * pW`; audio tokens
    # are `audio_in_channels` wide — both are config constants.
    packed_video_ch = z_dim * pT * pH * pW
    max_ch = max(packed_video_ch, audio_in_channels)

    video_ref_shape = (grid_t, grid_h, grid_w)
    video_coords = _build_coords(
        shape=video_ref_shape,
        ref_feat_shape=video_ref_shape,
        device=target_device,
        dtype=dtype,
    )

    # The two coordinate styles differ only in the audio reference length.
    if coords_style == "v2":
        audio_ref_len = ((audio_feat_len - 1) // _AUDIO_TEMPORAL_COMPRESSION + 1) // pT
    else:
        audio_ref_len = grid_t
    audio_coords = _build_coords(
        shape=(audio_feat_len, 1, 1),
        ref_feat_shape=(audio_ref_len, 1, 1),
        device=target_device,
        dtype=dtype,
    )

    video_mm = torch.full((video_token_num, ), MODALITY_VIDEO, dtype=torch.int64, device=target_device)
    audio_mm = torch.full((audio_feat_len, ), MODALITY_AUDIO, dtype=torch.int64, device=target_device)

    return StaticPackedLayout(
        video_coords=video_coords,
        audio_coords=audio_coords,
        video_mm=video_mm,
        audio_mm=audio_mm,
        max_ch=max_ch,
        video_token_num=video_token_num,
        audio_feat_len=audio_feat_len,
    )
fastvideo.pipelines.basic.magi_human.stages.latent_preparation.unpack_tokens
unpack_tokens(output: Tensor, video_token_num: int, audio_feat_len: int, video_in_channels: int, audio_in_channels: int, latent_shape: tuple[int, int, int, int, int], patch_size: tuple[int, int, int]) -> tuple[Tensor, Tensor]

Inverse of build_packed_inputs for the DiT output.

Splits the flat output back into a video latent (un-patched into B C T H W) and an audio latent (B, L, audio_in_channels) — 64 channels with the default stage configuration.

Source code in fastvideo/pipelines/basic/magi_human/stages/latent_preparation.py
def unpack_tokens(
    output: torch.Tensor,  # [L, max(V_ch, A_ch)]
    video_token_num: int,
    audio_feat_len: int,
    video_in_channels: int,
    audio_in_channels: int,
    latent_shape: tuple[int, int, int, int, int],  # [1, z_dim, T, H, W]
    patch_size: tuple[int, int, int],
) -> tuple[torch.Tensor, torch.Tensor]:
    """Inverse of `build_packed_inputs` for the DiT output.

    Splits the flat output back into a video latent (un-patched into
    B C T H W) and an audio latent (B, L, ``audio_in_channels``). Rows past
    ``video_token_num + audio_feat_len`` and any padding columns are dropped.

    Args:
        output: Flat DiT output with video rows first, then audio rows.
        video_token_num: Number of leading rows that belong to video.
        audio_feat_len: Number of rows after the video rows that belong to
            audio.
        video_in_channels: Number of leading columns kept for video rows.
        audio_in_channels: Number of leading columns kept for audio rows.
        latent_shape: Five-element latent shape; only ``H`` and ``W`` are
            used here.
        patch_size: ``(pT, pH, pW)`` patch sizes.

    Returns:
        ``(video_latent, audio_latent)``, each with a leading batch
        dimension of 1.
    """
    pT, pH, pW = patch_size
    # Only the spatial extents are needed; the five-way unpack is kept so a
    # wrongly sized ``latent_shape`` still fails here.
    _, _, _, H, W = latent_shape
    tH, tW = H // pH, W // pW

    # The temporal token count is left for rearrange to infer from the
    # number of video rows.
    video_flat = output[:video_token_num, :video_in_channels]
    video_latent = rearrange(
        video_flat,
        "(T H W) (pT pH pW C) -> C (T pT) (H pH) (W pW)",
        H=tH,
        W=tW,
        pT=pT,
        pH=pH,
        pW=pW,
    ).contiguous().unsqueeze(0)

    audio_latent = output[
        video_token_num:video_token_num + audio_feat_len,
        :audio_in_channels,
    ].unsqueeze(0)

    return video_latent, audio_latent

fastvideo.pipelines.basic.magi_human.stages.reference_image

Reference-image encoding for MagiHuman TI2V.

The upstream daVinci-MagiHuman TI2V path encodes the user image through the Wan VAE and overwrites the first denoising latent frame with that clean latent at every step. This stage mirrors MagiEvaluator.encode_image and stashes the normalized latent on batch.image_latent for the latent-prep and denoise stages.

Classes

fastvideo.pipelines.basic.magi_human.stages.reference_image.MagiHumanReferenceImageStage
MagiHumanReferenceImageStage(vae: Any, vae_scale_factor: int = 16)

Bases: PipelineStage

Encode a TI2V reference image into the first-frame video latent.

Source code in fastvideo/pipelines/basic/magi_human/stages/reference_image.py
def __init__(self, vae: Any, vae_scale_factor: int = 16) -> None:
    """Keep the VAE and build the video processor for this stage.

    Args:
        vae: VAE object, kept as ``self.vae``.
        vae_scale_factor: Forwarded to ``VideoProcessor`` as its
            ``vae_scale_factor`` argument.
    """
    super().__init__()
    self.vae = vae
    processor = VideoProcessor(vae_scale_factor=vae_scale_factor)
    self.video_processor = processor

Functions

fastvideo.pipelines.basic.magi_human.stages.sr_denoising

SR video-only denoising stage for daVinci-MagiHuman SR-540p.

Classes

fastvideo.pipelines.basic.magi_human.stages.sr_denoising.MagiHumanSRDenoisingStage
MagiHumanSRDenoisingStage(transformer, scheduler, patch_size: tuple[int, int, int] = (1, 2, 2), video_in_channels: int = 192, audio_in_channels: int = 64, sr_num_inference_steps: int = 5, sr_video_txt_guidance_scale: float = 3.5, use_cfg_trick: bool = True, cfg_trick_start_frame: int = 13, cfg_trick_value: float = 2.0, cfg_number: int = 2, coords_style: str = 'v1')

Bases: PipelineStage

Denoise only the SR video latent; audio passes through unchanged.

Source code in fastvideo/pipelines/basic/magi_human/stages/sr_denoising.py
def __init__(
    self,
    transformer,
    scheduler,
    patch_size: tuple[int, int, int] = (1, 2, 2),
    video_in_channels: int = 192,
    audio_in_channels: int = 64,
    sr_num_inference_steps: int = 5,
    sr_video_txt_guidance_scale: float = 3.5,
    use_cfg_trick: bool = True,
    cfg_trick_start_frame: int = 13,
    cfg_trick_value: float = 2.0,
    cfg_number: int = 2,
    coords_style: str = "v1",
) -> None:
    """Record the model handles and settings for SR denoising.

    Each argument is stored verbatim on the stage under the attribute of
    the same name; nothing is validated or derived here.

    Args:
        transformer: Model object, kept as ``self.transformer``.
        scheduler: Scheduler object, kept as ``self.scheduler``.
        patch_size: Three-integer patch size tuple.
        video_in_channels: Video channel count setting.
        audio_in_channels: Audio channel count setting.
        sr_num_inference_steps: Number of SR inference steps.
        sr_video_txt_guidance_scale: Text guidance scale for SR video.
        use_cfg_trick: Flag for the CFG trick.
        cfg_trick_start_frame: Start-frame setting for the CFG trick.
        cfg_trick_value: Value setting for the CFG trick.
        cfg_number: CFG setting (2 by default).
        coords_style: Coordinate style tag (``"v1"`` by default).
    """
    super().__init__()

    # Model handles.
    self.transformer, self.scheduler = transformer, scheduler

    # Token geometry.
    self.patch_size = patch_size
    self.video_in_channels, self.audio_in_channels = video_in_channels, audio_in_channels

    # Sampling and guidance configuration.
    self.sr_num_inference_steps = sr_num_inference_steps
    self.sr_video_txt_guidance_scale = sr_video_txt_guidance_scale
    self.use_cfg_trick = use_cfg_trick
    self.cfg_trick_start_frame, self.cfg_trick_value = cfg_trick_start_frame, cfg_trick_value
    self.cfg_number, self.coords_style = cfg_number, coords_style

Functions

fastvideo.pipelines.basic.magi_human.stages.sr_latent_preparation

Super-resolution latent preparation for daVinci-MagiHuman SR-540p.

Classes

fastvideo.pipelines.basic.magi_human.stages.sr_latent_preparation.MagiHumanSRLatentPreparationStage
MagiHumanSRLatentPreparationStage(vae: Any, vae_stride: tuple[int, int, int] = (4, 16, 16), patch_size: tuple[int, int, int] = (1, 2, 2), noise_value: int = 220, sr_audio_noise_scale: float = 0.7, sr_height: int = 512, sr_width: int = 896, vae_scale_factor: int = 16)

Bases: PipelineStage

Upsample base latents, add SR noise, and refresh SR conditioning.

Source code in fastvideo/pipelines/basic/magi_human/stages/sr_latent_preparation.py
def __init__(
    self,
    vae: Any,
    vae_stride: tuple[int, int, int] = (4, 16, 16),
    patch_size: tuple[int, int, int] = (1, 2, 2),
    noise_value: int = 220,
    sr_audio_noise_scale: float = 0.7,
    sr_height: int = 512,
    sr_width: int = 896,
    vae_scale_factor: int = 16,
) -> None:
    """Record SR settings and build the noise schedule and video processor.

    Args:
        vae: VAE object, kept as ``self.vae``.
        vae_stride: Three-integer VAE stride tuple.
        patch_size: Three-integer patch size tuple.
        noise_value: Noise setting, stored as-is.
        sr_audio_noise_scale: Audio noise scale setting.
        sr_height: SR height setting.
        sr_width: SR width setting.
        vae_scale_factor: Forwarded to ``VideoProcessor``.
    """
    super().__init__()

    self.vae = vae
    self.vae_stride, self.patch_size = vae_stride, patch_size
    self.noise_value, self.sr_audio_noise_scale = noise_value, sr_audio_noise_scale
    self.sr_height, self.sr_width = sr_height, sr_width

    # Default-configured discretization, evaluated once for 1000 steps.
    discretization = ZeroSNRDDPMDiscretization()
    self.sigmas = discretization(1000, do_append_zero=False, flip=True)

    self.video_processor = VideoProcessor(vae_scale_factor=vae_scale_factor)
fastvideo.pipelines.basic.magi_human.stages.sr_latent_preparation.ZeroSNRDDPMDiscretization
ZeroSNRDDPMDiscretization(linear_start: float = 0.00085, linear_end: float = 0.012, num_timesteps: int = 1000, shift_scale: float = 1.0, keep_start: bool = False, post_shift: bool = False)

Upstream ZeroSNR schedule used to corrupt interpolated SR latents.

Source code in fastvideo/pipelines/basic/magi_human/stages/sr_latent_preparation.py
def __init__(
    self,
    linear_start: float = 0.00085,
    linear_end: float = 0.0120,
    num_timesteps: int = 1000,
    shift_scale: float = 1.0,
    keep_start: bool = False,
    post_shift: bool = False,
) -> None:
    """Precompute the cumulative alpha products for the schedule.

    Betas are the squares of ``num_timesteps`` values evenly spaced between
    ``sqrt(linear_start)`` and ``sqrt(linear_end)``, computed in float64.
    Unless ``post_shift`` is set, the cumulative product of ``1 - beta`` is
    rescaled as ``a / (shift_scale + (1 - shift_scale) * a)``.

    Args:
        linear_start: First beta value before any adjustment.
        linear_end: Last beta value.
        num_timesteps: Number of schedule entries.
        shift_scale: Scale used by the shift formula.
        keep_start: When set (and ``post_shift`` is not), ``linear_start``
            is first passed through the same shift formula.
        post_shift: When set, no shift is applied here; the flag is only
            stored on the instance.
    """
    shift_now = not post_shift
    if keep_start and shift_now:
        linear_start = linear_start / (shift_scale + (1 - shift_scale) * linear_start)

    sqrt_betas = torch.linspace(
        linear_start**0.5,
        linear_end**0.5,
        num_timesteps,
        dtype=torch.float64,
    )
    betas = sqrt_betas**2
    cumulative = np.cumprod(1.0 - betas.numpy(), axis=0)
    if shift_now:
        cumulative = cumulative / (shift_scale + (1 - shift_scale) * cumulative)

    self.num_timesteps = num_timesteps
    self.alphas_cumprod = cumulative
    self.post_shift = post_shift
    self.shift_scale = shift_scale

Functions