pipelines
¶
Classes¶
fastvideo.configs.pipelines.Cosmos25Config
dataclass
¶
Cosmos25Config(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 0.0, flow_shift: float = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = (lambda: Cosmos25VideoConfig(arch_config=Cosmos25ArchConfig(num_attention_heads=16, attention_head_dim=128, in_channels=16, out_channels=16, num_layers=28, patch_size=[1, 2, 2], max_size=[128, 240, 240], rope_scale=[1.0, 3.0, 3.0], text_embed_dim=1024, mlp_ratio=4.0, adaln_lora_dim=256, use_adaln_lora=True, concat_padding_mask=True, extra_pos_embed_type=None, use_crossattn_projection=True, rope_enable_fps_modulation=False, qk_norm='rms_norm')))(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Cosmos25VAEConfig(), vae_precision: str = 'bf16', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Reason1Config(arch_config=Reason1ArchConfig(embedding_concat_strategy='full_concat')),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (_identity_preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (reason1_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None)
fastvideo.configs.pipelines.CosmosConfig
dataclass
¶
CosmosConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: float = 1.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = CosmosVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = CosmosVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5LargeConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_large_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, conditioning_strategy: str = 'frame_replace', min_num_conditional_frames: int = 1, max_num_conditional_frames: int = 2, sigma_conditional: float = 0.0001, sigma_data: float = 1.0, state_ch: int = 16, state_t: int = 24, text_encoder_class: str = 'T5')
fastvideo.configs.pipelines.FastHunyuanConfig
dataclass
¶
FastHunyuanConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: int = 17, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = HunyuanVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (LlamaConfig(), CLIPTextConfig()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp16'))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (llama_preprocess_text, clip_preprocess_text))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], tensor], ...] = (lambda: (llama_postprocess_text, clip_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None)
fastvideo.configs.pipelines.HYWorldConfig
dataclass
¶
HYWorldConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 5, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HYWorldConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = SiglipVisionConfig(), image_encoder_precision: str = 'fp16', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
fastvideo.configs.pipelines.Hunyuan15T2V480PConfig
dataclass
¶
Hunyuan15T2V480PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 5, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideo15Config(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
fastvideo.configs.pipelines.Hunyuan15T2V720PConfig
dataclass
¶
Hunyuan15T2V720PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 9, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideo15Config(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
fastvideo.configs.pipelines.HunyuanConfig
dataclass
¶
HunyuanConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: int = 7, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = HunyuanVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (LlamaConfig(), CLIPTextConfig()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp16'))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (llama_preprocess_text, clip_preprocess_text))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], tensor], ...] = (lambda: (llama_postprocess_text, clip_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None)
fastvideo.configs.pipelines.HunyuanGameCraftPipelineConfig
dataclass
¶
HunyuanGameCraftPipelineConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 5, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanGameCraftConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = GameCraftVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (LlamaConfig(), CLIPTextConfig()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp16'))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (llama_preprocess_text, clip_preprocess_text))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (llama_postprocess_text, clip_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None)
Bases: PipelineConfig
Configuration for HunyuanGameCraft pipeline.
Inherits text encoding from HunyuanVideo but uses: - GameCraft DiT with CameraNet - Same VAE (HunyuanVAE) - Same text encoders (LLaMA + CLIP)
fastvideo.configs.pipelines.LTX2T2VConfig
dataclass
¶
LTX2T2VConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = None, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = LTX2VideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = LTX2VAEConfig(), vae_precision: str = 'bf16', vae_tiling: bool = True, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (LTX2GemmaConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (ltx2_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, audio_decoder_config: ModelConfig = LTX2AudioDecoderConfig(), vocoder_config: ModelConfig = LTX2VocoderConfig(), audio_decoder_precision: str = 'bf16', vocoder_precision: str = 'bf16')
fastvideo.configs.pipelines.PipelineConfig
dataclass
¶
PipelineConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = None, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = DiTConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = VAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (EncoderConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], tensor], ...] = (lambda: (postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None)
Base configuration for all pipeline architectures.
Functions¶
fastvideo.configs.pipelines.PipelineConfig.from_kwargs
classmethod
¶
from_kwargs(kwargs: dict[str, Any], config_cli_prefix: str = '') -> PipelineConfig
Load PipelineConfig from kwargs Dictionary. kwargs: dictionary of kwargs config_cli_prefix: prefix of CLI arguments for this PipelineConfig instance
Source code in fastvideo/configs/pipelines/base.py
fastvideo.configs.pipelines.PipelineConfig.from_pretrained
classmethod
¶
from_pretrained(model_path: str) -> PipelineConfig
use the pipeline class setting from model_path to match the pipeline config
Source code in fastvideo/configs/pipelines/base.py
fastvideo.configs.pipelines.WanI2V480PConfig
dataclass
¶
WanI2V480PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 3.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = CLIPVisionConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.WanI2V720PConfig
dataclass
¶
WanI2V720PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = CLIPVisionConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.WanT2V480PConfig
dataclass
¶
WanT2V480PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 3.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.WanT2V720PConfig
dataclass
¶
WanT2V720PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
Modules¶
fastvideo.configs.pipelines.base
¶
Classes¶
fastvideo.configs.pipelines.base.PipelineConfig
dataclass
¶
PipelineConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = None, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = DiTConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = VAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (EncoderConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], tensor], ...] = (lambda: (postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None)
Base configuration for all pipeline architectures.
Functions¶
fastvideo.configs.pipelines.base.PipelineConfig.from_kwargs
classmethod
¶from_kwargs(kwargs: dict[str, Any], config_cli_prefix: str = '') -> PipelineConfig
Load PipelineConfig from kwargs Dictionary. kwargs: dictionary of kwargs config_cli_prefix: prefix of CLI arguments for this PipelineConfig instance
Source code in fastvideo/configs/pipelines/base.py
fastvideo.configs.pipelines.base.PipelineConfig.from_pretrained
classmethod
¶from_pretrained(model_path: str) -> PipelineConfig
use the pipeline class setting from model_path to match the pipeline config
Source code in fastvideo/configs/pipelines/base.py
Functions¶
fastvideo.configs.pipelines.base.parse_int_list
¶
Parse a comma-separated string of integers into a list.
fastvideo.configs.pipelines.cosmos
¶
Classes¶
fastvideo.configs.pipelines.cosmos.CosmosConfig
dataclass
¶
CosmosConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: float = 1.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = CosmosVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = CosmosVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5LargeConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_large_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, conditioning_strategy: str = 'frame_replace', min_num_conditional_frames: int = 1, max_num_conditional_frames: int = 2, sigma_conditional: float = 0.0001, sigma_data: float = 1.0, state_ch: int = 16, state_t: int = 24, text_encoder_class: str = 'T5')
Functions¶
fastvideo.configs.pipelines.cosmos.t5_large_postprocess_text
¶
Postprocess T5 Large text encoder outputs for Cosmos pipeline.
Return raw last_hidden_state without truncation/padding.
Source code in fastvideo/configs/pipelines/cosmos.py
fastvideo.configs.pipelines.cosmos2_5
¶
Classes¶
fastvideo.configs.pipelines.cosmos2_5.Cosmos25Config
dataclass
¶
Cosmos25Config(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 0.0, flow_shift: float = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = (lambda: Cosmos25VideoConfig(arch_config=Cosmos25ArchConfig(num_attention_heads=16, attention_head_dim=128, in_channels=16, out_channels=16, num_layers=28, patch_size=[1, 2, 2], max_size=[128, 240, 240], rope_scale=[1.0, 3.0, 3.0], text_embed_dim=1024, mlp_ratio=4.0, adaln_lora_dim=256, use_adaln_lora=True, concat_padding_mask=True, extra_pos_embed_type=None, use_crossattn_projection=True, rope_enable_fps_modulation=False, qk_norm='rms_norm')))(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Cosmos25VAEConfig(), vae_precision: str = 'bf16', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Reason1Config(arch_config=Reason1ArchConfig(embedding_concat_strategy='full_concat')),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (_identity_preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (reason1_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None)
fastvideo.configs.pipelines.gen3c
¶
Classes¶
fastvideo.configs.pipelines.gen3c.Gen3CConfig
dataclass
¶
Gen3CConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: float = 1.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = Gen3CVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Gen3CVAEConfig(), vae_precision: str = 'bf16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (_Gen3CT5LargeConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_large_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, conditioning_strategy: str = 'frame_replace', min_num_conditional_frames: int = 1, max_num_conditional_frames: int = 2, sigma_conditional: float = 0.001, sigma_data: float = 0.5, state_ch: int = 16, state_t: int = 16, text_encoder_class: str = 'T5', frame_buffer_max: int = 2, noise_aug_strength: float = 0.0, filter_points_threshold: float = 0.05, use_moge_depth: bool = True, moge_model_name: str = 'Ruicheng/moge-vitl', offload_moge_after_depth: bool = True, default_trajectory_type: str = 'left', default_movement_distance: float = 0.3, default_camera_rotation: str = 'center_facing', video_resolution: tuple[int, int] = (704, 1280), num_frames: int = 121, fps: int = 24, cfg_behavior: str = 'legacy', default_negative_prompt: str = 'The video captures a series of frames showing ugly scenes, static with no motion, motion blur, over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. Overall, the video is of poor quality.', autoregressive_chunk_frames: int = 121, autoregressive_overlap_frames: int = 1)
Bases: PipelineConfig
Configuration for GEN3C Video Generation Pipeline.
GEN3C extends Cosmos with 3D cache for camera-controlled video generation. Key parameters: - frame_buffer_max: Number of 3D cache buffers (default: 2) - noise_aug_strength: Strength of noise augmentation per buffer - filter_points_threshold: Threshold for filtering unreliable depth points
fastvideo.configs.pipelines.gen3c.Gen3CInferenceConfig
dataclass
¶
Gen3CInferenceConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: float = 1.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = Gen3CVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Gen3CVAEConfig(), vae_precision: str = 'bf16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (_Gen3CT5LargeConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_large_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, conditioning_strategy: str = 'frame_replace', min_num_conditional_frames: int = 1, max_num_conditional_frames: int = 2, sigma_conditional: float = 0.001, sigma_data: float = 0.5, state_ch: int = 16, state_t: int = 16, text_encoder_class: str = 'T5', frame_buffer_max: int = 2, noise_aug_strength: float = 0.0, filter_points_threshold: float = 0.05, use_moge_depth: bool = True, moge_model_name: str = 'Ruicheng/moge-vitl', offload_moge_after_depth: bool = True, default_trajectory_type: str = 'left', default_movement_distance: float = 0.3, default_camera_rotation: str = 'center_facing', video_resolution: tuple[int, int] = (704, 1280), num_frames: int = 121, fps: int = 24, cfg_behavior: str = 'legacy', default_negative_prompt: str = 'The video captures a series of frames showing ugly scenes, static with no motion, motion blur, over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. Overall, the video is of poor quality.', autoregressive_chunk_frames: int = 121, autoregressive_overlap_frames: int = 1, batch_size: int = 1, gradient_checkpointing: bool = False, guidance_scale: float = 1.0, num_inference_steps: int = 35)
Functions¶
fastvideo.configs.pipelines.gen3c.t5_large_postprocess_text
¶
Postprocess T5 Large text encoder outputs for GEN3C pipeline.
Return raw last_hidden_state without truncation/padding.
Source code in fastvideo/configs/pipelines/gen3c.py
fastvideo.configs.pipelines.hunyuan
¶
Classes¶
fastvideo.configs.pipelines.hunyuan.FastHunyuanConfig
dataclass
¶
FastHunyuanConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: int = 17, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = HunyuanVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (LlamaConfig(), CLIPTextConfig()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp16'))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (llama_preprocess_text, clip_preprocess_text))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], tensor], ...] = (lambda: (llama_postprocess_text, clip_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None)
fastvideo.configs.pipelines.hunyuan.HunyuanConfig
dataclass
¶
HunyuanConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: int = 7, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = HunyuanVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (LlamaConfig(), CLIPTextConfig()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp16'))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (llama_preprocess_text, clip_preprocess_text))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], tensor], ...] = (lambda: (llama_postprocess_text, clip_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None)
fastvideo.configs.pipelines.hunyuan15
¶
Classes¶
fastvideo.configs.pipelines.hunyuan15.Hunyuan15I2V720PConfig
dataclass
¶
Hunyuan15I2V720PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 7, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideo15Config(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
fastvideo.configs.pipelines.hunyuan15.Hunyuan15SR1080PConfig
dataclass
¶
Hunyuan15SR1080PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 7, flow_shift_sr: int = 2, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideo15Config(), dit_precision: str = 'bf16', upsampler_config: tuple[UpsamplerConfig, ...] = (lambda: (SRTo720pUpsamplerConfig(), SRTo1080pUpsamplerConfig()))(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
fastvideo.configs.pipelines.hunyuan15.Hunyuan15T2V480PConfig
dataclass
¶
Hunyuan15T2V480PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 5, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideo15Config(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
fastvideo.configs.pipelines.hunyuan15.Hunyuan15T2V720PConfig
dataclass
¶
Hunyuan15T2V720PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 9, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideo15Config(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
Functions¶
fastvideo.configs.pipelines.hunyuan15.extract_glyph_texts
¶
Extract glyph texts from prompt using regex pattern.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
prompt
|
str
|
Input prompt string |
required |
Returns:
| Type | Description |
|---|---|
str | None
|
List of extracted glyph texts |
Source code in fastvideo/configs/pipelines/hunyuan15.py
fastvideo.configs.pipelines.hunyuan15.format_text_input
¶
Apply text to template.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
prompt
|
List[str]
|
Input text. |
required |
system_message
|
str
|
System message. |
required |
Returns:
| Type | Description |
|---|---|
list[dict[str, Any]]
|
List[Dict[str, Any]]: List of chat conversation. |
Source code in fastvideo/configs/pipelines/hunyuan15.py
fastvideo.configs.pipelines.hunyuangamecraft
¶
Pipeline configuration for HunyuanGameCraft.
HunyuanGameCraft extends HunyuanVideo with: 1. CameraNet for camera/action conditioning (Plücker coordinates) 2. Mask-based conditioning for autoregressive generation 3. 33 input channels (16 latent + 16 gt_latent + 1 mask)
Text encoders are the same as HunyuanVideo: - LLaVA-LLaMA-3-8B for primary text encoding (4096 dim) - CLIP ViT-L/14 for secondary pooled embeddings (768 dim)
Classes¶
fastvideo.configs.pipelines.hunyuangamecraft.HunyuanGameCraftPipelineConfig
dataclass
¶
HunyuanGameCraftPipelineConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 5, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanGameCraftConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = GameCraftVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (LlamaConfig(), CLIPTextConfig()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp16'))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (llama_preprocess_text, clip_preprocess_text))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (llama_postprocess_text, clip_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None)
Bases: PipelineConfig
Configuration for HunyuanGameCraft pipeline.
Inherits text encoding from HunyuanVideo but uses: - GameCraft DiT with CameraNet - Same VAE (HunyuanVAE) - Same text encoders (LLaMA + CLIP)
fastvideo.configs.pipelines.hyworld
¶
Classes¶
fastvideo.configs.pipelines.hyworld.HYWorldConfig
dataclass
¶
HYWorldConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 5, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HYWorldConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = SiglipVisionConfig(), image_encoder_precision: str = 'fp16', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
fastvideo.configs.pipelines.longcat
¶
Classes¶
fastvideo.configs.pipelines.longcat.LongCatDiTArchConfig
dataclass
¶
LongCatDiTArchConfig(stacked_params_mapping: list[tuple[str, str, str]] = list(), _fsdp_shard_conditions: list = list(), _compile_conditions: list = list(), param_names_mapping: dict = dict(), reverse_param_names_mapping: dict = dict(), lora_param_names_mapping: dict = dict(), _supported_attention_backends: tuple[AttentionBackendEnum, ...] = (SAGE_ATTN, FLASH_ATTN, TORCH_SDPA, VIDEO_SPARSE_ATTN, VMOBA_ATTN, SAGE_ATTN_THREE, SLA_ATTN, SAGE_SLA_ATTN), hidden_size: int = 0, num_attention_heads: int = 0, num_channels_latents: int = 0, in_channels: int = 16, out_channels: int = 16, exclude_lora_layers: list[str] = list(), boundary_ratio: float | None = None, adaln_tembed_dim: int = 512, caption_channels: int = 4096, depth: int = 48, enable_bsa: bool = False, enable_flashattn3: bool = False, enable_flashattn2: bool = True, enable_xformers: bool = False, frequency_embedding_size: int = 256, mlp_ratio: int = 4, num_heads: int = 32, text_tokens_zero_pad: bool = True, patch_size: list[int] = (lambda: [1, 2, 2])(), cp_split_hw: list[int] | None = None, bsa_params: dict | None = None)
Bases: DiTArchConfig
Extended DiTArchConfig with LongCat-specific fields.
fastvideo.configs.pipelines.longcat.LongCatT2V480PConfig
dataclass
¶
LongCatT2V480PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = None, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = (lambda: DiTConfig(arch_config=LongCatDiTArchConfig()))(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'bf16', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[T5Config, ...] = (lambda: (LongCatT5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (longcat_preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (umt5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, enable_kv_cache: bool = True, offload_kv_cache: bool = False, enable_bsa: bool = False, use_distill: bool = False, enhance_hf: bool = False, bsa_params: dict | None = None, bsa_sparsity: float | None = None, bsa_cdf_threshold: float | None = None, bsa_chunk_q: list[int] | None = None, bsa_chunk_k: list[int] | None = None, t_thresh: float | None = None)
Bases: PipelineConfig
Configuration for LongCat pipeline (480p).
Components expected by loaders
- tokenizer: AutoTokenizer
- text_encoder: UMT5EncoderModel
- transformer: LongCatTransformer3DModel
- vae: AutoencoderKLWan (Wan VAE, 4x8 compression)
- scheduler: FlowMatchEulerDiscreteScheduler
fastvideo.configs.pipelines.longcat.LongCatT2V704PConfig
dataclass
¶
LongCatT2V704PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = None, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = (lambda: DiTConfig(arch_config=LongCatDiTArchConfig()))(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'bf16', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[T5Config, ...] = (lambda: (LongCatT5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (longcat_preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (umt5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, enable_kv_cache: bool = True, offload_kv_cache: bool = False, enable_bsa: bool = True, use_distill: bool = False, enhance_hf: bool = False, bsa_params: dict | None = None, bsa_sparsity: float | None = None, bsa_cdf_threshold: float | None = None, bsa_chunk_q: list[int] | None = None, bsa_chunk_k: list[int] | None = None, t_thresh: float | None = None)
Bases: LongCatT2V480PConfig
Configuration for LongCat pipeline (704p) with BSA enabled by default.
Uses the same resolution and BSA parameters as original LongCat refinement stage.
BSA parameters configured in transformer config.json with chunk_3d_shape=[4,4,4]:
- Input: 704×1280×96
- VAE (8x): 88×160×96
- Patch [1,2,2]: 44×80×96
- chunk [4,4,4]: 96%4=0, 44%4=0, 80%4=0 ✅
This configuration matches the original LongCat refinement stage parameters.
fastvideo.configs.pipelines.longcat.LongCatT5ArchConfig
dataclass
¶
LongCatT5ArchConfig(stacked_params_mapping: list[tuple[str, str, str]] = (lambda: [('.qkv_proj', '.q', 'q'), ('.qkv_proj', '.k', 'k'), ('.qkv_proj', '.v', 'v')])(), architectures: list[str] = (lambda: [])(), _supported_attention_backends: tuple[AttentionBackendEnum, ...] = (FLASH_ATTN, TORCH_SDPA), output_hidden_states: bool = False, use_return_dict: bool = True, vocab_size: int = 32128, hidden_size: int = 0, num_hidden_layers: int = 0, num_attention_heads: int = 0, pad_token_id: int = 0, eos_token_id: int = 1, text_len: int = 512, hidden_state_skip_layer: int = 0, decoder_start_token_id: int = 0, output_past: bool = True, scalable_attention: bool = True, tie_word_embeddings: bool = False, tokenizer_kwargs: dict[str, Any] = dict(), _fsdp_shard_conditions: list = (lambda: [_is_transformer_layer, _is_embeddings, _is_final_layernorm])(), d_model: int = 512, d_kv: int = 64, d_ff: int = 2048, num_layers: int = 6, num_decoder_layers: int | None = None, num_heads: int = 8, relative_attention_num_buckets: int = 32, relative_attention_max_distance: int = 128, dropout_rate: float = 0.1, layer_norm_epsilon: float = 1e-06, initializer_factor: float = 1.0, feed_forward_proj: str = 'relu', dense_act_fn: str = '', is_gated_act: bool = False, is_encoder_decoder: bool = True, use_cache: bool = True, classifier_dropout: float = 0.0, dtype: str | None = None, gradient_checkpointing: bool = False)
Bases: T5ArchConfig
T5 arch that pads tokenizer output to max_length.
LongCat's denoising stage concatenates positive and negative
attention masks along the batch dimension for CFG, which requires
uniform seq length. The shared :class:T5ArchConfig dropped the
"padding": "max_length" tokenizer kwarg so other DiTs could run
with variable-length masks; LongCat still needs the uniform
contract.
Functions¶
fastvideo.configs.pipelines.longcat.longcat_preprocess_text
¶
Clean and preprocess text like original LongCat implementation.
This function applies the same text cleaning pipeline as the original LongCat-Video implementation to ensure identical tokenization results.
Steps: 1. basic_clean: Fix unicode issues and unescape HTML entities 2. whitespace_clean: Normalize whitespace to single spaces
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
prompt
|
str
|
Raw input text prompt |
required |
Returns:
| Type | Description |
|---|---|
str
|
Cleaned and normalized text prompt |
Source code in fastvideo/configs/pipelines/longcat.py
fastvideo.configs.pipelines.longcat.umt5_postprocess_text
¶
Postprocess UMT5/T5 encoder outputs to fixed length 512 embeddings.
Source code in fastvideo/configs/pipelines/longcat.py
fastvideo.configs.pipelines.turbodiffusion
¶
TurboDiffusion pipeline configurations.
TurboDiffusion uses RCM (recurrent Consistency Model) scheduler with SLA (Sparse-Linear Attention) for fast 1-4 step video generation.
Classes¶
fastvideo.configs.pipelines.turbodiffusion.TurboDiffusionI2VConfig
dataclass
¶
TurboDiffusionI2VConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = CLIPVisionConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = 0.9, precision: str = 'bf16', warp_denoising_step: bool = True)
Bases: PipelineConfig
Base configuration for TurboDiffusion I2V pipeline.
Uses RCM scheduler with sigma_max=200 for 1-4 step generation. Uses boundary_ratio=0.9 for high-noise to low-noise model switching.
fastvideo.configs.pipelines.turbodiffusion.TurboDiffusionI2V_A14B_Config
dataclass
¶
TurboDiffusionI2V_A14B_Config(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = CLIPVisionConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = 0.9, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.turbodiffusion.TurboDiffusionT2VConfig
dataclass
¶
TurboDiffusionT2VConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 3.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
Bases: PipelineConfig
Base configuration for TurboDiffusion T2V pipeline.
Uses RCM scheduler with sigma_max=80 for 1-4 step generation. No boundary_ratio (single model, no switching).
fastvideo.configs.pipelines.turbodiffusion.TurboDiffusionT2V_14B_Config
dataclass
¶
TurboDiffusionT2V_14B_Config(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
Bases: TurboDiffusionT2VConfig
Configuration for TurboDiffusion T2V 14B model.
Uses same config as 1.3B but with higher flow_shift for 14B model.
fastvideo.configs.pipelines.turbodiffusion.TurboDiffusionT2V_1_3B_Config
dataclass
¶
TurboDiffusionT2V_1_3B_Config(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 3.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.wan
¶
Classes¶
fastvideo.configs.pipelines.wan.FastWan2_1_T2V_480P_Config
dataclass
¶
FastWan2_1_T2V_480P_Config(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 8.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = (lambda: [1000, 757, 522])(), ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.wan.WANV2VConfig
dataclass
¶
WANV2VConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 3.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = WAN2_1ControlCLIPVisionConfig(), image_encoder_precision: str = 'bf16', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.wan.WanI2V480PConfig
dataclass
¶
WanI2V480PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 3.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = CLIPVisionConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.wan.WanI2V720PConfig
dataclass
¶
WanI2V720PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = CLIPVisionConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.wan.WanT2V480PConfig
dataclass
¶
WanT2V480PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 3.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.wan.WanT2V720PConfig
dataclass
¶
WanT2V720PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)