pipelines
¶
Classes¶
fastvideo.configs.pipelines.Cosmos25Config
dataclass
¶
Cosmos25Config(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 0.0, flow_shift: float = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = (lambda: Cosmos25VideoConfig(arch_config=Cosmos25ArchConfig(num_attention_heads=16, attention_head_dim=128, in_channels=16, out_channels=16, num_layers=28, patch_size=[1, 2, 2], max_size=[128, 240, 240], rope_scale=[1.0, 3.0, 3.0], text_embed_dim=1024, mlp_ratio=4.0, adaln_lora_dim=256, use_adaln_lora=True, concat_padding_mask=True, extra_pos_embed_type=None, use_crossattn_projection=True, rope_enable_fps_modulation=False, qk_norm='rms_norm')))(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Cosmos25VAEConfig(), vae_precision: str = 'bf16', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Reason1Config(arch_config=Reason1ArchConfig(embedding_concat_strategy='full_concat')),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (_identity_preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (reason1_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None)
fastvideo.configs.pipelines.CosmosConfig
dataclass
¶
CosmosConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: float = 1.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = CosmosVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = CosmosVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5LargeConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_large_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, conditioning_strategy: str = 'frame_replace', min_num_conditional_frames: int = 1, max_num_conditional_frames: int = 2, sigma_conditional: float = 0.0001, sigma_data: float = 1.0, state_ch: int = 16, state_t: int = 24, text_encoder_class: str = 'T5')
fastvideo.configs.pipelines.FastHunyuanConfig
dataclass
¶
FastHunyuanConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: int = 17, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = HunyuanVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (LlamaConfig(), CLIPTextConfig()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp16'))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (llama_preprocess_text, clip_preprocess_text))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], tensor], ...] = (lambda: (llama_postprocess_text, clip_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None)
fastvideo.configs.pipelines.HYWorldConfig
dataclass
¶
HYWorldConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 5, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = HYWorldConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = SiglipVisionConfig(), image_encoder_precision: str = 'fp16', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
fastvideo.configs.pipelines.Hunyuan15T2V480PConfig
dataclass
¶
Hunyuan15T2V480PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 5, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideo15Config(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
fastvideo.configs.pipelines.Hunyuan15T2V720PConfig
dataclass
¶
Hunyuan15T2V720PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 9, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideo15Config(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
fastvideo.configs.pipelines.HunyuanConfig
dataclass
¶
HunyuanConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: int = 7, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = HunyuanVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (LlamaConfig(), CLIPTextConfig()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp16'))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (llama_preprocess_text, clip_preprocess_text))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], tensor], ...] = (lambda: (llama_postprocess_text, clip_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None)
fastvideo.configs.pipelines.HunyuanGameCraftPipelineConfig
dataclass
¶
HunyuanGameCraftPipelineConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 5, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanGameCraftConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = GameCraftVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (LlamaConfig(), CLIPTextConfig()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp16'))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (llama_preprocess_text, clip_preprocess_text))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (llama_postprocess_text, clip_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None)
Bases: PipelineConfig
Configuration for HunyuanGameCraft pipeline.
Inherits text encoding from HunyuanVideo but uses:
- GameCraft DiT with CameraNet
- Same VAE (HunyuanVAE)
- Same text encoders (LLaMA + CLIP)
fastvideo.configs.pipelines.LTX2T2VConfig
dataclass
¶
LTX2T2VConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = None, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = LTX2VideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = LTX2VAEConfig(), vae_precision: str = 'bf16', vae_tiling: bool = True, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (LTX2GemmaConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (ltx2_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, audio_decoder_config: ModelConfig = LTX2AudioDecoderConfig(), vocoder_config: ModelConfig = LTX2VocoderConfig(), audio_decoder_precision: str = 'bf16', vocoder_precision: str = 'bf16')
fastvideo.configs.pipelines.LucyEditDevConfig
dataclass
¶
LucyEditDevConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = (lambda: WanVideoConfig(arch_config=WanVideoArchConfig(num_attention_heads=24, in_channels=96, out_channels=48, ffn_dim=14336, num_layers=30)))(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = (lambda: WanVAEConfig(arch_config=WanVAEArchConfig(base_dim=160, decoder_base_dim=256, z_dim=48, in_channels=12, out_channels=12, scale_factor_spatial=16, patch_size=2, is_residual=True, clip_output=False, latents_mean=(-0.2289, -0.0052, -0.1323, -0.2339, -0.2799, 0.0174, 0.1838, 0.1557, -0.1382, 0.0542, 0.2813, 0.0891, 0.157, -0.0098, 0.0375, -0.1825, -0.2246, -0.1207, -0.0698, 0.5109, 0.2665, -0.2108, -0.2158, 0.2502, -0.2055, -0.0322, 0.1109, 0.1567, -0.0729, 0.0899, -0.2799, -0.123, -0.0313, -0.1649, 0.0117, 0.0723, -0.2839, -0.2083, -0.052, 0.3748, 0.0152, 0.1957, 0.1433, -0.2944, 0.3573, -0.0548, -0.1681, -0.0667), latents_std=(0.4765, 1.0364, 0.4514, 1.1677, 0.5313, 0.499, 0.4818, 0.5013, 0.8158, 1.0344, 0.5894, 1.0901, 0.6885, 0.6165, 0.8454, 0.4978, 0.5759, 0.3523, 0.7135, 0.6804, 0.5833, 1.4146, 0.8986, 0.5659, 0.7069, 0.5338, 0.4889, 0.4917, 0.4069, 0.4999, 0.6866, 0.4093, 0.5709, 0.6065, 0.6415, 0.4944, 0.5726, 1.2042, 0.5458, 1.6887, 0.3971, 1.06, 0.3943, 0.5537, 0.5444, 0.4089, 0.7468, 0.7744))))(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = True, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True, expand_timesteps: bool = True)
Bases: Wan2_2_TI2V_5B_Config
Configuration for Decart Lucy Edit Dev video editing.
fastvideo.configs.pipelines.PipelineConfig
dataclass
¶
PipelineConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = None, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = DiTConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = VAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (EncoderConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], tensor], ...] = (lambda: (postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None)
Base configuration for all pipeline architectures.
Methods:¶
fastvideo.configs.pipelines.PipelineConfig.from_kwargs
classmethod
¶
from_kwargs(kwargs: dict[str, Any], config_cli_prefix: str = '') -> PipelineConfig
Load a PipelineConfig from a kwargs dictionary.
Args:
- kwargs: dictionary of kwargs
- config_cli_prefix: prefix of CLI arguments for this PipelineConfig instance
Source code in fastvideo/configs/pipelines/base.py
fastvideo.configs.pipelines.PipelineConfig.from_pretrained
classmethod
¶
from_pretrained(model_path: str) -> PipelineConfig
Use the pipeline class setting from model_path to match the pipeline config.
Source code in fastvideo/configs/pipelines/base.py
fastvideo.configs.pipelines.WanI2V480PConfig
dataclass
¶
WanI2V480PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 3.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = CLIPVisionConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.WanI2V720PConfig
dataclass
¶
WanI2V720PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = CLIPVisionConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.WanT2V480PConfig
dataclass
¶
WanT2V480PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 3.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.WanT2V720PConfig
dataclass
¶
WanT2V720PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
Modules¶
fastvideo.configs.pipelines.base
¶
Classes¶
fastvideo.configs.pipelines.base.PipelineConfig
dataclass
¶
PipelineConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = None, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = DiTConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = VAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (EncoderConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], tensor], ...] = (lambda: (postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None)
Base configuration for all pipeline architectures.
Methods:¶
fastvideo.configs.pipelines.base.PipelineConfig.from_kwargs
classmethod
¶
from_kwargs(kwargs: dict[str, Any], config_cli_prefix: str = '') -> PipelineConfig
Load a PipelineConfig from a kwargs dictionary.
Args:
- kwargs: dictionary of kwargs
- config_cli_prefix: prefix of CLI arguments for this PipelineConfig instance
Source code in fastvideo/configs/pipelines/base.py
fastvideo.configs.pipelines.base.PipelineConfig.from_pretrained
classmethod
¶
from_pretrained(model_path: str) -> PipelineConfig
Use the pipeline class setting from model_path to match the pipeline config.
Source code in fastvideo/configs/pipelines/base.py
Functions:¶
fastvideo.configs.pipelines.base.parse_int_list
¶
Parse a comma-separated string of integers into a list.
fastvideo.configs.pipelines.cosmos
¶
Classes¶
fastvideo.configs.pipelines.cosmos.CosmosConfig
dataclass
¶
CosmosConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: float = 1.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = CosmosVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = CosmosVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5LargeConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_large_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, conditioning_strategy: str = 'frame_replace', min_num_conditional_frames: int = 1, max_num_conditional_frames: int = 2, sigma_conditional: float = 0.0001, sigma_data: float = 1.0, state_ch: int = 16, state_t: int = 24, text_encoder_class: str = 'T5')
Functions:¶
fastvideo.configs.pipelines.cosmos.t5_large_postprocess_text
¶
Postprocess T5 Large text encoder outputs for Cosmos pipeline.
Return raw last_hidden_state without truncation/padding.
Source code in fastvideo/configs/pipelines/cosmos.py
fastvideo.configs.pipelines.cosmos2_5
¶
Classes¶
fastvideo.configs.pipelines.cosmos2_5.Cosmos25Config
dataclass
¶
Cosmos25Config(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 0.0, flow_shift: float = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = (lambda: Cosmos25VideoConfig(arch_config=Cosmos25ArchConfig(num_attention_heads=16, attention_head_dim=128, in_channels=16, out_channels=16, num_layers=28, patch_size=[1, 2, 2], max_size=[128, 240, 240], rope_scale=[1.0, 3.0, 3.0], text_embed_dim=1024, mlp_ratio=4.0, adaln_lora_dim=256, use_adaln_lora=True, concat_padding_mask=True, extra_pos_embed_type=None, use_crossattn_projection=True, rope_enable_fps_modulation=False, qk_norm='rms_norm')))(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Cosmos25VAEConfig(), vae_precision: str = 'bf16', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Reason1Config(arch_config=Reason1ArchConfig(embedding_concat_strategy='full_concat')),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (_identity_preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (reason1_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None)
fastvideo.configs.pipelines.cosmos2_5.Cosmos25_14BConfig
dataclass
¶
Cosmos25_14BConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 0.0, flow_shift: float = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = (lambda: Cosmos25_14BVideoConfig(arch_config=Cosmos25_14BArchConfig(num_attention_heads=40, attention_head_dim=128, in_channels=16, out_channels=16, num_layers=36, patch_size=[1, 2, 2], max_size=[128, 240, 240], rope_scale=[1.0, 3.0, 3.0], text_embed_dim=1024, mlp_ratio=4.0, adaln_lora_dim=256, use_adaln_lora=True, concat_padding_mask=True, extra_pos_embed_type=None, use_crossattn_projection=True, rope_enable_fps_modulation=False, qk_norm='rms_norm')))(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Cosmos25VAEConfig(), vae_precision: str = 'bf16', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Reason1Config(arch_config=Reason1ArchConfig(embedding_concat_strategy='full_concat')),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (_identity_preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (reason1_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None)
fastvideo.configs.pipelines.flux_2
¶
Classes¶
fastvideo.configs.pipelines.flux_2.Flux2KleinEncoderArchConfig
dataclass
¶
Flux2KleinEncoderArchConfig(stacked_params_mapping: list[tuple[str, str, str]] = list(), architectures: list[str] = (lambda: [])(), _supported_attention_backends: tuple[AttentionBackendEnum, ...] = (FLASH_ATTN, TORCH_SDPA), output_hidden_states: bool = True, use_return_dict: bool = True)
Bases: EncoderArchConfig
Encoder arch config for Flux2 Klein (Qwen3); needs hidden states for layers 9, 18, 27.
fastvideo.configs.pipelines.flux_2.Flux2KleinPipelineConfig
dataclass
¶
Flux2KleinPipelineConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float | None = None, flow_shift: float | None = None, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = True, is_causal: bool = False, dit_config: DiTConfig = Flux2Config(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Flux2VAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen3TextConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (flux2_klein_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, flux2_text_encoder_type: str = 'qwen3', text_encoder_out_layers: tuple[int, ...] = (9, 18, 27))
fastvideo.configs.pipelines.flux_2.Flux2KleinTextEncoderConfig
dataclass
¶
Flux2KleinTextEncoderConfig(arch_config: EncoderArchConfig = Flux2KleinEncoderArchConfig(), prefix: str = '', quant_config: QuantizationConfig | None = None, lora_config: Any | None = None)
Bases: EncoderConfig
Text encoder config for Flux2 Klein (Qwen3).
fastvideo.configs.pipelines.flux_2.Flux2PipelineConfig
dataclass
¶
Flux2PipelineConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float | None = 4.0, flow_shift: float | None = None, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = True, is_causal: bool = False, dit_config: DiTConfig = Flux2Config(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Flux2VAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Mistral3TextConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (default_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, flux2_text_encoder_type: str = 'mistral3', text_encoder_out_layers: tuple[int, ...] = (10, 20, 30))
Bases: PipelineConfig
Configuration for Flux2 image generation pipeline.
Functions:¶
fastvideo.configs.pipelines.flux_2.flux2_klein_postprocess_text
¶
Klein postprocess: hidden states from layers 9, 18, 27 (Qwen3).
Source code in fastvideo/configs/pipelines/flux_2.py
fastvideo.configs.pipelines.gen3c
¶
Classes¶
fastvideo.configs.pipelines.gen3c.Gen3CConfig
dataclass
¶
Gen3CConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: float = 1.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = Gen3CVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Gen3CVAEConfig(), vae_precision: str = 'bf16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (_Gen3CT5LargeConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_large_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, conditioning_strategy: str = 'frame_replace', min_num_conditional_frames: int = 1, max_num_conditional_frames: int = 2, sigma_conditional: float = 0.001, sigma_data: float = 0.5, state_ch: int = 16, state_t: int = 16, text_encoder_class: str = 'T5', frame_buffer_max: int = 2, noise_aug_strength: float = 0.0, filter_points_threshold: float = 0.05, use_moge_depth: bool = True, moge_model_name: str = 'Ruicheng/moge-vitl', offload_moge_after_depth: bool = True, default_trajectory_type: str = 'left', default_movement_distance: float = 0.3, default_camera_rotation: str = 'center_facing', video_resolution: tuple[int, int] = (704, 1280), num_frames: int = 121, fps: int = 24, cfg_behavior: str = 'legacy', default_negative_prompt: str = 'The video captures a series of frames showing ugly scenes, static with no motion, motion blur, over-saturation, shaky footage, 
low resolution, grainy texture, pixelated images, poorly lit areas, underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. Overall, the video is of poor quality.', autoregressive_chunk_frames: int = 121, autoregressive_overlap_frames: int = 1)
Bases: PipelineConfig
Configuration for GEN3C Video Generation Pipeline.
GEN3C extends Cosmos with 3D cache for camera-controlled video generation. Key parameters: - frame_buffer_max: Number of 3D cache buffers (default: 2) - noise_aug_strength: Strength of noise augmentation per buffer - filter_points_threshold: Threshold for filtering unreliable depth points
fastvideo.configs.pipelines.gen3c.Gen3CInferenceConfig
dataclass
¶
Gen3CInferenceConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: float = 1.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = Gen3CVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Gen3CVAEConfig(), vae_precision: str = 'bf16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (_Gen3CT5LargeConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_large_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, conditioning_strategy: str = 'frame_replace', min_num_conditional_frames: int = 1, max_num_conditional_frames: int = 2, sigma_conditional: float = 0.001, sigma_data: float = 0.5, state_ch: int = 16, state_t: int = 16, text_encoder_class: str = 'T5', frame_buffer_max: int = 2, noise_aug_strength: float = 0.0, filter_points_threshold: float = 0.05, use_moge_depth: bool = True, moge_model_name: str = 'Ruicheng/moge-vitl', offload_moge_after_depth: bool = True, default_trajectory_type: str = 'left', default_movement_distance: float = 0.3, default_camera_rotation: str = 'center_facing', video_resolution: tuple[int, int] = (704, 1280), num_frames: int = 121, fps: int = 24, cfg_behavior: str = 'legacy', default_negative_prompt: str = 'The video captures a series of frames showing ugly scenes, static with no motion, motion blur, over-saturation, shaky 
footage, low resolution, grainy texture, pixelated images, poorly lit areas, underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. Overall, the video is of poor quality.', autoregressive_chunk_frames: int = 121, autoregressive_overlap_frames: int = 1, batch_size: int = 1, gradient_checkpointing: bool = False, guidance_scale: float = 1.0, num_inference_steps: int = 35)
Functions:¶
fastvideo.configs.pipelines.gen3c.t5_large_postprocess_text
¶
Postprocess T5 Large text encoder outputs for GEN3C pipeline.
Return raw last_hidden_state without truncation/padding.
Source code in fastvideo/configs/pipelines/gen3c.py
fastvideo.configs.pipelines.hunyuan
¶
Classes¶
fastvideo.configs.pipelines.hunyuan.FastHunyuanConfig
dataclass
¶
FastHunyuanConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: int = 17, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = HunyuanVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (LlamaConfig(), CLIPTextConfig()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp16'))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (llama_preprocess_text, clip_preprocess_text))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], tensor], ...] = (lambda: (llama_postprocess_text, clip_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None)
fastvideo.configs.pipelines.hunyuan.HunyuanConfig
dataclass
¶
HunyuanConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: int = 7, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = HunyuanVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (LlamaConfig(), CLIPTextConfig()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp16'))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (llama_preprocess_text, clip_preprocess_text))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], tensor], ...] = (lambda: (llama_postprocess_text, clip_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None)
fastvideo.configs.pipelines.hunyuan15
¶
Classes¶
fastvideo.configs.pipelines.hunyuan15.Hunyuan15I2V720PConfig
dataclass
¶
Hunyuan15I2V720PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 7, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideo15Config(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
fastvideo.configs.pipelines.hunyuan15.Hunyuan15SR1080PConfig
dataclass
¶
Hunyuan15SR1080PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 7, flow_shift_sr: int = 2, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideo15Config(), dit_precision: str = 'bf16', upsampler_config: tuple[UpsamplerConfig, ...] = (lambda: (SRTo720pUpsamplerConfig(), SRTo1080pUpsamplerConfig()))(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
fastvideo.configs.pipelines.hunyuan15.Hunyuan15T2V480PConfig
dataclass
¶
Hunyuan15T2V480PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 5, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideo15Config(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
fastvideo.configs.pipelines.hunyuan15.Hunyuan15T2V720PConfig
dataclass
¶
Hunyuan15T2V720PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 9, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideo15Config(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
Functions:¶
fastvideo.configs.pipelines.hunyuan15.extract_glyph_texts
¶
Extract glyph texts from prompt using regex pattern.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
prompt
|
str
|
Input prompt string |
required |
Returns:
| Type | Description |
|---|---|
str | None
|
Extracted glyph texts as a single string, or None |
Source code in fastvideo/configs/pipelines/hunyuan15.py
fastvideo.configs.pipelines.hunyuan15.format_text_input
¶
Apply text to template.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
prompt
|
List[str]
|
Input text. |
required |
system_message
|
str
|
System message. |
required |
Returns:
| Type | Description |
|---|---|
list[dict[str, Any]]
|
A list of chat conversation messages. |
Source code in fastvideo/configs/pipelines/hunyuan15.py
fastvideo.configs.pipelines.hunyuangamecraft
¶
Pipeline configuration for HunyuanGameCraft.
HunyuanGameCraft extends HunyuanVideo with: 1. CameraNet for camera/action conditioning (Plücker coordinates) 2. Mask-based conditioning for autoregressive generation 3. 33 input channels (16 latent + 16 gt_latent + 1 mask)
Text encoders are the same as HunyuanVideo: - LLaVA-LLaMA-3-8B for primary text encoding (4096 dim) - CLIP ViT-L/14 for secondary pooled embeddings (768 dim)
Classes¶
fastvideo.configs.pipelines.hunyuangamecraft.HunyuanGameCraftPipelineConfig
dataclass
¶
HunyuanGameCraftPipelineConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 5, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanGameCraftConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = GameCraftVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (LlamaConfig(), CLIPTextConfig()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp16'))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (llama_preprocess_text, clip_preprocess_text))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (llama_postprocess_text, clip_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None)
Bases: PipelineConfig
Configuration for HunyuanGameCraft pipeline.
Inherits text encoding from HunyuanVideo, with the following components: - GameCraft DiT with CameraNet - Same VAE (HunyuanVAE) - Same text encoders (LLaMA + CLIP)
fastvideo.configs.pipelines.hyworld
¶
Classes¶
fastvideo.configs.pipelines.hyworld.HYWorldConfig
dataclass
¶
HYWorldConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 5, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = HYWorldConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = SiglipVisionConfig(), image_encoder_precision: str = 'fp16', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
fastvideo.configs.pipelines.longcat
¶
Classes¶
fastvideo.configs.pipelines.longcat.LongCatDiTArchConfig
dataclass
¶
LongCatDiTArchConfig(stacked_params_mapping: list[tuple[str, str, str]] = list(), _fsdp_shard_conditions: list = list(), _compile_conditions: list = list(), param_names_mapping: dict = dict(), reverse_param_names_mapping: dict = dict(), lora_param_names_mapping: dict = dict(), cast_prompt_embeds_to_dit_dtype: bool = False, _supported_attention_backends: tuple[AttentionBackendEnum, ...] = (SAGE_ATTN, FLASH_ATTN, TORCH_SDPA, VIDEO_SPARSE_ATTN, VMOBA_ATTN, SAGE_ATTN_THREE, SLA_ATTN, SAGE_SLA_ATTN), hidden_size: int = 0, num_attention_heads: int = 0, num_channels_latents: int = 0, in_channels: int = 16, out_channels: int = 16, exclude_lora_layers: list[str] = list(), boundary_ratio: float | None = None, adaln_tembed_dim: int = 512, caption_channels: int = 4096, depth: int = 48, enable_bsa: bool = False, enable_flashattn3: bool = False, enable_flashattn2: bool = True, enable_xformers: bool = False, frequency_embedding_size: int = 256, mlp_ratio: int = 4, num_heads: int = 32, text_tokens_zero_pad: bool = True, patch_size: list[int] = (lambda: [1, 2, 2])(), cp_split_hw: list[int] | None = None, bsa_params: dict | None = None)
Bases: DiTArchConfig
Extended DiTArchConfig with LongCat-specific fields.
fastvideo.configs.pipelines.longcat.LongCatT2V480PConfig
dataclass
¶
LongCatT2V480PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = None, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = (lambda: DiTConfig(arch_config=LongCatDiTArchConfig()))(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'bf16', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[T5Config, ...] = (lambda: (LongCatT5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (longcat_preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (umt5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, enable_kv_cache: bool = True, offload_kv_cache: bool = False, enable_bsa: bool = False, use_distill: bool = False, enhance_hf: bool = False, bsa_params: dict | None = None, bsa_sparsity: float | None = None, bsa_cdf_threshold: float | None = None, bsa_chunk_q: list[int] | None = None, bsa_chunk_k: list[int] | None = None, t_thresh: float | None = None)
Bases: PipelineConfig
Configuration for LongCat pipeline (480p).
Components expected by loaders:
- tokenizer: AutoTokenizer
- text_encoder: UMT5EncoderModel
- transformer: LongCatTransformer3DModel
- vae: AutoencoderKLWan (Wan VAE, 4x8 compression)
- scheduler: FlowMatchEulerDiscreteScheduler
fastvideo.configs.pipelines.longcat.LongCatT2V704PConfig
dataclass
¶
LongCatT2V704PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = None, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = (lambda: DiTConfig(arch_config=LongCatDiTArchConfig()))(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'bf16', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[T5Config, ...] = (lambda: (LongCatT5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (longcat_preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (umt5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, enable_kv_cache: bool = True, offload_kv_cache: bool = False, enable_bsa: bool = True, use_distill: bool = False, enhance_hf: bool = False, bsa_params: dict | None = None, bsa_sparsity: float | None = None, bsa_cdf_threshold: float | None = None, bsa_chunk_q: list[int] | None = None, bsa_chunk_k: list[int] | None = None, t_thresh: float | None = None)
Bases: LongCatT2V480PConfig
Configuration for LongCat pipeline (704p) with BSA enabled by default.
Uses the same resolution and BSA parameters as original LongCat refinement stage.
BSA parameters configured in transformer config.json with chunk_3d_shape=[4,4,4]:
- Input: 704×1280×96
- VAE (8x): 88×160×96
- Patch [1,2,2]: 44×80×96
- chunk [4,4,4]: 96%4=0, 44%4=0, 80%4=0 ✅
This configuration matches the original LongCat refinement stage parameters.
fastvideo.configs.pipelines.longcat.LongCatT5ArchConfig
dataclass
¶
LongCatT5ArchConfig(stacked_params_mapping: list[tuple[str, str, str]] = (lambda: [('.qkv_proj', '.q', 'q'), ('.qkv_proj', '.k', 'k'), ('.qkv_proj', '.v', 'v')])(), architectures: list[str] = (lambda: [])(), _supported_attention_backends: tuple[AttentionBackendEnum, ...] = (FLASH_ATTN, TORCH_SDPA), output_hidden_states: bool = False, use_return_dict: bool = True, vocab_size: int = 32128, hidden_size: int = 0, num_hidden_layers: int = 0, num_attention_heads: int = 0, pad_token_id: int = 0, eos_token_id: int = 1, text_len: int = 512, hidden_state_skip_layer: int = 0, decoder_start_token_id: int = 0, output_past: bool = True, scalable_attention: bool = True, tie_word_embeddings: bool = False, tokenizer_kwargs: dict[str, Any] = dict(), _fsdp_shard_conditions: list = (lambda: [_is_transformer_layer, _is_embeddings, _is_final_layernorm])(), require_processor: bool = False, d_model: int = 512, d_kv: int = 64, d_ff: int = 2048, num_layers: int = 6, num_decoder_layers: int | None = None, num_heads: int = 8, relative_attention_num_buckets: int = 32, relative_attention_max_distance: int = 128, dropout_rate: float = 0.1, layer_norm_epsilon: float = 1e-06, initializer_factor: float = 1.0, feed_forward_proj: str = 'relu', dense_act_fn: str = '', is_gated_act: bool = False, is_encoder_decoder: bool = True, use_cache: bool = True, classifier_dropout: float = 0.0, dtype: str | None = None, gradient_checkpointing: bool = False, n_positions: int = 512, task_specific_params: dict | None = None)
Bases: T5ArchConfig
T5 arch that pads tokenizer output to max_length.
LongCat's denoising stage concatenates positive and negative
attention masks along the batch dimension for CFG, which requires
uniform sequence length. The shared T5ArchConfig dropped the
"padding": "max_length" tokenizer kwarg so other DiTs could run
with variable-length masks; LongCat still needs the uniform
contract.
Functions:¶
fastvideo.configs.pipelines.longcat.longcat_preprocess_text
¶
Clean and preprocess text like original LongCat implementation.
This function applies the same text cleaning pipeline as the original LongCat-Video implementation to ensure identical tokenization results.
Steps: 1. basic_clean: Fix unicode issues and unescape HTML entities 2. whitespace_clean: Normalize whitespace to single spaces
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
prompt
|
str
|
Raw input text prompt |
required |
Returns:
| Type | Description |
|---|---|
str
|
Cleaned and normalized text prompt |
Source code in fastvideo/configs/pipelines/longcat.py
fastvideo.configs.pipelines.longcat.umt5_postprocess_text
¶
Postprocess UMT5/T5 encoder outputs to fixed length 512 embeddings.
Source code in fastvideo/configs/pipelines/longcat.py
fastvideo.configs.pipelines.stable_audio
¶
PipelineConfig for Stable Audio Open 1.0.
Classes¶
fastvideo.configs.pipelines.stable_audio.StableAudioOpenSmallConfig
dataclass
¶
StableAudioOpenSmallConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = None, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = StableAudioConfig(), dit_precision: str = 'fp16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = OobleckVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple = tuple(), text_encoder_precisions: tuple[str, ...] = tuple(), preprocess_text_funcs: tuple = tuple(), postprocess_text_funcs: tuple = tuple(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, num_inference_steps: int = 100, guidance_scale: float = 7.0, audio_end_in_s: float = 6.0, audio_start_in_s: float = 0.0, sampling_rate: int = 44100, audio_channels: int = 2, sample_size: int = 524288, max_audio_duration_s: float = 524288 / 44100, precision: str = 'fp16')
Bases: StableAudioT2AConfig
stable-audio-open-small overrides: shorter training window
(524288 samples ≈ 11.89s @ 44.1 kHz) and a faster default sampler
config carried by the small preset.
fastvideo.configs.pipelines.stable_audio.StableAudioT2AConfig
dataclass
¶
StableAudioT2AConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = None, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = StableAudioConfig(), dit_precision: str = 'fp16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = OobleckVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple = tuple(), text_encoder_precisions: tuple[str, ...] = tuple(), preprocess_text_funcs: tuple = tuple(), postprocess_text_funcs: tuple = tuple(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, num_inference_steps: int = 100, guidance_scale: float = 7.0, audio_end_in_s: float = 10.0, audio_start_in_s: float = 0.0, sampling_rate: int = 44100, audio_channels: int = 2, sample_size: int = 2097152, max_audio_duration_s: float = 2097152 / 44100, precision: str = 'fp16')
fastvideo.configs.pipelines.turbodiffusion
¶
TurboDiffusion pipeline configurations.
TurboDiffusion uses the RCM (score-regularized continuous-time consistency model) scheduler with SLA (Sparse-Linear Attention) for fast 1-4 step video generation.
Classes¶
fastvideo.configs.pipelines.turbodiffusion.TurboDiffusionI2VConfig
dataclass
¶
TurboDiffusionI2VConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = CLIPVisionConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = 0.9, precision: str = 'bf16', warp_denoising_step: bool = True)
Bases: PipelineConfig
Base configuration for TurboDiffusion I2V pipeline.
Uses RCM scheduler with sigma_max=200 for 1-4 step generation. Uses boundary_ratio=0.9 for high-noise to low-noise model switching.
fastvideo.configs.pipelines.turbodiffusion.TurboDiffusionI2V_A14B_Config
dataclass
¶
TurboDiffusionI2V_A14B_Config(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = CLIPVisionConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = 0.9, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.turbodiffusion.TurboDiffusionT2VConfig
dataclass
¶
TurboDiffusionT2VConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 3.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
Bases: PipelineConfig
Base configuration for the TurboDiffusion T2V pipeline.
Uses the RCM scheduler with sigma_max=80 for 1-4 step generation. boundary_ratio is None (single model, no switching).
fastvideo.configs.pipelines.turbodiffusion.TurboDiffusionT2V_14B_Config
dataclass
¶
TurboDiffusionT2V_14B_Config(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
Bases: TurboDiffusionT2VConfig
Configuration for the TurboDiffusion T2V 14B model.
Uses the same configuration as the 1.3B model, but with a higher flow_shift (5.0 instead of 3.0) for the 14B model.
fastvideo.configs.pipelines.turbodiffusion.TurboDiffusionT2V_1_3B_Config
dataclass
¶
TurboDiffusionT2V_1_3B_Config(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 3.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.wan
¶
Classes¶
fastvideo.configs.pipelines.wan.FastWan2_1_T2V_480P_Config
dataclass
¶
FastWan2_1_T2V_480P_Config(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 8.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = (lambda: [1000, 757, 522])(), ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.wan.LucyEditDevConfig
dataclass
¶
LucyEditDevConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = (lambda: WanVideoConfig(arch_config=WanVideoArchConfig(num_attention_heads=24, in_channels=96, out_channels=48, ffn_dim=14336, num_layers=30)))(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = (lambda: WanVAEConfig(arch_config=WanVAEArchConfig(base_dim=160, decoder_base_dim=256, z_dim=48, in_channels=12, out_channels=12, scale_factor_spatial=16, patch_size=2, is_residual=True, clip_output=False, latents_mean=(-0.2289, -0.0052, -0.1323, -0.2339, -0.2799, 0.0174, 0.1838, 0.1557, -0.1382, 0.0542, 0.2813, 0.0891, 0.157, -0.0098, 0.0375, -0.1825, -0.2246, -0.1207, -0.0698, 0.5109, 0.2665, -0.2108, -0.2158, 0.2502, -0.2055, -0.0322, 0.1109, 0.1567, -0.0729, 0.0899, -0.2799, -0.123, -0.0313, -0.1649, 0.0117, 0.0723, -0.2839, -0.2083, -0.052, 0.3748, 0.0152, 0.1957, 0.1433, -0.2944, 0.3573, -0.0548, -0.1681, -0.0667), latents_std=(0.4765, 1.0364, 0.4514, 1.1677, 0.5313, 0.499, 0.4818, 0.5013, 0.8158, 1.0344, 0.5894, 1.0901, 0.6885, 0.6165, 0.8454, 0.4978, 0.5759, 0.3523, 0.7135, 0.6804, 0.5833, 1.4146, 0.8986, 0.5659, 0.7069, 0.5338, 0.4889, 0.4917, 0.4069, 0.4999, 0.6866, 0.4093, 0.5709, 0.6065, 0.6415, 0.4944, 0.5726, 1.2042, 0.5458, 1.6887, 0.3971, 1.06, 0.3943, 0.5537, 0.5444, 0.4089, 0.7468, 0.7744))))(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] 
= (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = True, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True, expand_timesteps: bool = True)
Bases: Wan2_2_TI2V_5B_Config
Configuration for Decart Lucy Edit Dev video editing.
fastvideo.configs.pipelines.wan.WANV2VConfig
dataclass
¶
WANV2VConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 3.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = WAN2_1ControlCLIPVisionConfig(), image_encoder_precision: str = 'bf16', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.wan.WanI2V480PConfig
dataclass
¶
WanI2V480PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 3.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = CLIPVisionConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.wan.WanI2V720PConfig
dataclass
¶
WanI2V720PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = CLIPVisionConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.wan.WanT2V480PConfig
dataclass
¶
WanT2V480PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 3.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.wan.WanT2V720PConfig
dataclass
¶
WanT2V720PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, scheduler_step_in_fp32: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, lucy_edit_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)