decoding ¶

Decoding stage for diffusion pipelines.

Classes¶

fastvideo.pipelines.stages.decoding.DecodingStage ¶

DecodingStage(vae, pipeline=None)

Bases: PipelineStage

Stage for decoding latent representations into pixel space.

This stage handles the decoding of latent representations into the final output format (e.g., pixel values).

Source code in fastvideo/pipelines/stages/decoding.py

def __init__(self, vae, pipeline=None) -> None:
    """
    Set up the decoding stage.

    Args:
        vae: VAE used to decode latents; stored on ``self.vae``.
        pipeline: Optional owning pipeline. Only a weak reference to it is
            stored, so ``self.pipeline`` is either ``None`` or a
            ``weakref.ref`` that must be called to obtain the pipeline.
    """
    self.vae: ParallelTiledVAE = vae
    # Compare against None explicitly: a pipeline object that happens to be
    # falsy (for example one defining __len__ or __bool__) must still be
    # referenced instead of being silently dropped.
    self.pipeline = weakref.ref(pipeline) if pipeline is not None else None

Methods:¶

fastvideo.pipelines.stages.decoding.DecodingStage.decode ¶

decode(latents: Tensor, fastvideo_args: FastVideoArgs) -> Tensor

Decode latent representations into pixel space using VAE.

Parameters:

Name	Type	Description	Default
`latents`	`Tensor`	Input latent tensor with shape (batch, channels, frames, height_latents, width_latents)	required
`fastvideo_args`	`FastVideoArgs`	Configuration containing: - disable_autocast: Whether to disable automatic mixed precision (default: False) - pipeline_config.vae_decode_precision: Optional decode-only precision override; pipeline_config.vae_precision is used when it is unset - pipeline_config.vae_precision: VAE computation precision ("fp32", "fp16", "bf16") - pipeline_config.vae_tiling: Whether to enable VAE tiling for memory efficiency	required

Returns:

Type	Description
`Tensor`	Decoded video tensor with shape (batch, channels, frames, height, width), normalized to the [0, 1] range. The tensor is returned as produced by the VAE; this method does not move it to CPU or cast it to float32.

Source code in fastvideo/pipelines/stages/decoding.py

@torch.no_grad()
def decode(self, latents: torch.Tensor, fastvideo_args: FastVideoArgs) -> torch.Tensor:
    """
    Decode latent representations into pixel space using VAE.

    The VAE and the latents are first moved to the local torch device.
    Latents are denormalized with ``_denormalize_latents`` unless they are 5D
    Flux2 packed latents; those go through
    ``_flux2_bn_denorm_and_unpatchify`` right before the VAE call instead.

    Args:
        latents: Input latent tensor with shape (batch, channels, frames, height_latents, width_latents)
        fastvideo_args: Configuration containing:
            - disable_autocast: When set, autocast is not used and the latents are cast
              to the VAE dtype instead
            - pipeline_config.vae_decode_precision: Decode-only precision override; when it
              is falsy, pipeline_config.vae_precision is used
            - pipeline_config.vae_precision: VAE computation precision ("fp32", "fp16", "bf16")
            - pipeline_config.vae_tiling: Whether to enable VAE tiling for memory efficiency

    Returns:
        Decoded video tensor with shape (batch, channels, frames, height, width),
        mapped with ``x / 2 + 0.5`` and clamped to the [0, 1] range. No CPU move
        or float32 cast is performed in this method.
    """
    self.vae = self.vae.to(get_local_torch_device())
    latents = latents.to(get_local_torch_device())

    # Setup VAE precision (decode-only override falls back to vae_precision)
    decode_precision = (fastvideo_args.pipeline_config.vae_decode_precision
                        or fastvideo_args.pipeline_config.vae_precision)
    vae_dtype = PRECISION_TO_TYPE[decode_precision]
    # Autocast is only useful for reduced precision and can be disabled globally.
    vae_autocast_enabled = (vae_dtype != torch.float32) and not fastvideo_args.disable_autocast

    # Flux2: skip denormalize on packed latents; BN denorm runs below instead
    if not (latents.ndim == 5 and self._is_flux2_packed(latents)):
        latents = self._denormalize_latents(latents, fastvideo_args)

    # The released LingBot decoder enters Wan VAE in channels-last-3d
    # layout; matching that boundary avoids selecting a different Conv3d path.
    if (type(fastvideo_args.pipeline_config).__name__.startswith("LingBotVideo") and latents.ndim == 5):
        latents = latents.contiguous(memory_format=torch.channels_last_3d)

    # Decode latents
    # NOTE(review): device_type is hard-coded to "cuda" — confirm the intended
    # behaviour when the local torch device is not a CUDA device.
    with torch.autocast(device_type="cuda", dtype=vae_dtype, enabled=vae_autocast_enabled):
        if fastvideo_args.pipeline_config.vae_tiling:
            self.vae.enable_tiling()
        # if fastvideo_args.vae_sp:
        #     self.vae.enable_parallel()
        # Without autocast the latents must be cast to the VAE dtype by hand.
        if not vae_autocast_enabled:
            latents = latents.to(vae_dtype)
        # Flux2's image VAE expects 4D (B, C, H, W); squeeze the singleton T
        # only for Flux2 packed latents. Gated on `_is_flux2_packed` so video
        # VAEs that legitimately decode 5D latents with T=1 are untouched.
        squeezed_for_vae = False
        if latents.ndim == 5 and latents.shape[2] == 1 and self._is_flux2_packed(latents):
            latents = latents.squeeze(2)
            squeezed_for_vae = True
        # Flux2 packed: BN denorm + unpatchify for VAE decode.
        # BN denorm is the complete inverse normalisation for Flux2 (no
        # scaling_factor/shift_factor step), matching Diffusers.
        if latents.ndim == 4 and self._is_flux2_packed(latents):
            latents = self._flux2_bn_denorm_and_unpatchify(latents)
        image = self.vae.decode(latents)
        # Unwrap diffusers-style DecoderOutput / tuple (Flux2 VAE returns a
        # DecoderOutput). No-op for existing VAEs that return a plain tensor.
        if hasattr(image, "sample"):
            image = image.sample
        elif isinstance(image, tuple | list):
            image = image[0]
        # Restore the frame axis that was squeezed away for the 4D VAE call.
        if squeezed_for_vae:
            image = image.unsqueeze(2)

    # Normalize image to [0, 1] range
    image = (image / 2 + 0.5).clamp(0, 1)
    return image

fastvideo.pipelines.stages.decoding.DecodingStage.forward ¶

forward(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch

Decode latent representations into pixel space.

This method processes the batch through the VAE decoder, converting latent representations to pixel-space video/images. It also optionally decodes trajectory latents for visualization purposes.

Parameters:

Name	Type	Description	Default
`batch`	`ForwardBatch`	The current batch containing: - latents: Tensor to decode (batch, channels, frames, height_latents, width_latents) - return_trajectory_decoded (optional): Flag to decode trajectory latents - trajectory_latents (optional): Latents at different timesteps - trajectory_timesteps (optional): Corresponding timesteps	required
`fastvideo_args`	`FastVideoArgs`	Configuration containing: - output_type: "latent" to skip decoding, otherwise decode to pixels - vae_cpu_offload: Whether to offload VAE to CPU after decoding - model_loaded: Track VAE loading state - model_paths: Path to VAE model if loading needed	required

Returns:

Type	Description
`ForwardBatch`	Modified batch with: - output: Decoded frames (batch, channels, frames, height, width) cast to float32, or the latents themselves when output_type is "latent" - trajectory_decoded (if requested): List of decoded frames per timestep, each moved to CPU as float32

Source code in fastvideo/pipelines/stages/decoding.py

@torch.no_grad()
def forward(
    self,
    batch: ForwardBatch,
    fastvideo_args: FastVideoArgs,
) -> ForwardBatch:
    """
    Decode latent representations into pixel space.

    This method processes the batch through the VAE decoder, converting latent
    representations to pixel-space video/images. It also optionally decodes
    trajectory latents for visualization purposes. The VAE is loaded on demand
    when ``fastvideo_args.model_loaded["vae"]`` is falsy, and is dropped again
    at the end when the MPS backend is available.

    Args:
        batch: The current batch containing:
            - latents: Tensor to decode (batch, channels, frames, height_latents, width_latents)
            - return_trajectory_decoded (optional): Flag to decode trajectory latents
            - trajectory_latents (optional): Latents at different timesteps
            - trajectory_timesteps (optional): Corresponding timesteps
            - num_cond_frames_added / new_frame_size_before_padding (optional): When both
              attributes exist, they drive a crop of the temporal dimension
        fastvideo_args: Configuration containing:
            - output_type: "latent" to skip decoding, otherwise decode to pixels
            - vae_cpu_offload: Whether to offload VAE to CPU after decoding
            - model_loaded: Track VAE loading state
            - model_paths: Path to VAE model if loading needed

    Returns:
        Modified batch with:
            - output: Decoded frames (batch, channels, frames, height, width) cast to float32,
              or the latents themselves when output_type is "latent". No explicit CPU move
              is done here.
            - trajectory_decoded (if requested): List of decoded frames per timestep, each
              moved to CPU as float32
    """
    # load vae if not already loaded (used for memory constrained devices)
    # self.pipeline is a weak reference (or None); call it to get the pipeline.
    pipeline = self.pipeline() if self.pipeline else None
    if not fastvideo_args.model_loaded["vae"]:
        loader = VAELoader()
        self.vae = loader.load(fastvideo_args.model_paths["vae"], fastvideo_args)
        if pipeline:
            pipeline.add_module("vae", self.vae)
        fastvideo_args.model_loaded["vae"] = True

    if fastvideo_args.output_type == "latent":
        frames = batch.latents
        # Flux2 packed latents with a singleton frame axis are still
        # BN-denormalized and unpatchified in latent mode.
        if frames.ndim == 5 and frames.shape[2] == 1 and self._is_flux2_packed(frames):
            frames = self._flux2_bn_denorm_and_unpatchify(frames.squeeze(2))
            frames = frames.unsqueeze(2)
    else:
        frames = self.decode(batch.latents, fastvideo_args)

    # decode trajectory latents if needed
    if batch.return_trajectory_decoded:
        batch.trajectory_decoded = []
        assert batch.trajectory_latents is not None, "batch should have trajectory latents"
        for idx in range(batch.trajectory_latents.shape[1]):
            # batch.trajectory_latents is [batch_size, timesteps, channels, frames, height, width]
            cur_latent = batch.trajectory_latents[:, idx, :, :, :, :]
            cur_timestep = batch.trajectory_timesteps[idx]
            logger.info("decoding trajectory latent for timestep: %s", cur_timestep)
            decoded_frames = self.decode(cur_latent, fastvideo_args)
            # Trajectory frames are stored on CPU as float32.
            batch.trajectory_decoded.append(decoded_frames.cpu().float())

    # Convert to float32 for compatibility
    frames = frames.to(torch.float32)

    # Crop padding if this is a LongCat refinement
    if hasattr(batch, 'num_cond_frames_added') and hasattr(batch, 'new_frame_size_before_padding'):
        num_cond_frames_added = batch.num_cond_frames_added
        new_frame_size = batch.new_frame_size_before_padding
        if num_cond_frames_added > 0 or frames.shape[2] != new_frame_size:
            # frames is [B, C, T, H, W], crop temporal dimension
            frames = frames[:, :, num_cond_frames_added:num_cond_frames_added + new_frame_size, :, :]
            logger.info("Cropped LongCat refinement padding: %s:%s, final shape: %s", num_cond_frames_added,
                        num_cond_frames_added + new_frame_size, frames.shape)

    # Update batch with decoded image
    batch.output = frames

    # Offload models if needed
    if hasattr(self, 'maybe_free_model_hooks'):
        self.maybe_free_model_hooks()

    if fastvideo_args.vae_cpu_offload:
        self.vae.to("cpu")

    # When the MPS backend is available, drop the VAE entirely and mark it as
    # unloaded so that the next call reloads it through the branch above.
    if torch.backends.mps.is_available():
        del self.vae
        if pipeline is not None and "vae" in pipeline.modules:
            del pipeline.modules["vae"]
        fastvideo_args.model_loaded["vae"] = False

    return batch

fastvideo.pipelines.stages.decoding.DecodingStage.streaming_decode ¶

streaming_decode(latents: Tensor, fastvideo_args: FastVideoArgs, cache: list[Tensor | None] | None = None, is_first_chunk: bool = False) -> tuple[Tensor, list[Tensor | None]]

Decode latent representations into pixel space using VAE with streaming cache.

Parameters:

Name	Type	Description	Default
`latents`	`Tensor`	Input latent tensor with shape (batch, channels, frames, height_latents, width_latents)	required
`fastvideo_args`	`FastVideoArgs`	Configuration object.	required
`cache`	`list[Tensor \| None] \| None`	VAE cache from previous call, or None to initialize a new cache.	`None`
`is_first_chunk`	`bool`	Whether this is the first chunk.	`False`

Returns:

Type	Description
`tuple[Tensor, list[Tensor \| None]]`	A tuple of (decoded_frames, updated_cache).

Source code in fastvideo/pipelines/stages/decoding.py

@torch.no_grad()
def streaming_decode(
    self,
    latents: torch.Tensor,
    fastvideo_args: FastVideoArgs,
    cache: list[torch.Tensor | None] | None = None,
    is_first_chunk: bool = False,
) -> tuple[torch.Tensor, list[torch.Tensor | None]]:
    """
    Decode one chunk of latents to pixels while carrying a VAE streaming cache.

    Args:
        latents: Input latent tensor with shape (batch, channels, frames, height_latents, width_latents)
        fastvideo_args: Configuration object; the decode precision, tiling and
            autocast settings are read from it.
        cache: Cache returned by the previous call. When None, a fresh one is
            requested from the VAE.
        is_first_chunk: Forwarded to the VAE to mark the first chunk.

    Returns:
        ``(decoded_frames, updated_cache)`` where the frames are mapped with
        ``x / 2 + 0.5`` and clamped to the [0, 1] range.
    """
    target_device = get_local_torch_device()
    self.vae = self.vae.to(target_device)
    latents = latents.to(target_device)

    # The decode-only precision wins; otherwise use the general VAE precision.
    pipeline_cfg = fastvideo_args.pipeline_config
    precision_name = pipeline_cfg.vae_decode_precision or pipeline_cfg.vae_precision
    vae_dtype = PRECISION_TO_TYPE[precision_name]
    use_autocast = not (vae_dtype == torch.float32 or fastvideo_args.disable_autocast)

    latents = self._denormalize_latents(latents, fastvideo_args)

    # Start from a fresh streaming cache when the caller did not pass one.
    stream_cache = self.vae.get_streaming_cache() if cache is None else cache

    with torch.autocast(device_type="cuda", dtype=vae_dtype, enabled=use_autocast):
        if pipeline_cfg.vae_tiling:
            self.vae.enable_tiling()
        # Without autocast the latents have to be cast to the VAE dtype by hand.
        vae_input = latents if use_autocast else latents.to(vae_dtype)
        decoded, stream_cache = self.vae.streaming_decode(vae_input, stream_cache, is_first_chunk)

    # Bring the decoder output into the [0, 1] range.
    pixels = (decoded / 2 + 0.5).clamp(0, 1)
    assert stream_cache is not None, "cache should not be None after streaming_decode"
    return pixels, stream_cache

fastvideo.pipelines.stages.decoding.DecodingStage.verify_input ¶

verify_input(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify decoding stage inputs.

Source code in fastvideo/pipelines/stages/decoding.py

def verify_input(self, batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Check that ``batch.latents`` is a 5-dimensional tensor before decoding."""
    checks = VerificationResult()
    # Expected layout of the denoised latents handed to the VAE:
    # [batch_size, channels, frames, height_latents, width_latents]
    latent_validators = [V.is_tensor, V.with_dims(5)]
    checks.add_check("latents", batch.latents, latent_validators)
    return checks

fastvideo.pipelines.stages.decoding.DecodingStage.verify_output ¶

verify_output(batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult

Verify decoding stage outputs.

Source code in fastvideo/pipelines/stages/decoding.py

def verify_output(self, batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult:
    """Check that ``batch.output`` is a 5-dimensional tensor after decoding."""
    checks = VerificationResult()
    # Expected layout of the decoded video/images:
    # [batch_size, channels, frames, height, width]
    output_validators = [V.is_tensor, V.with_dims(5)]
    checks.add_check("output", batch.output, output_validators)
    return checks

decoding ¶

Classes¶

fastvideo.pipelines.stages.decoding.DecodingStage ¶

Methods:¶

fastvideo.pipelines.stages.decoding.DecodingStage.decode ¶

fastvideo.pipelines.stages.decoding.DecodingStage.forward ¶

fastvideo.pipelines.stages.decoding.DecodingStage.streaming_decode ¶

fastvideo.pipelines.stages.decoding.DecodingStage.verify_input ¶

fastvideo.pipelines.stages.decoding.DecodingStage.verify_output ¶

Functions:¶