configs
¶
Modules¶
fastvideo.configs.configs
¶
Classes¶
fastvideo.configs.configs.DatasetType
¶
Enumeration for different dataset types.
Functions¶
fastvideo.configs.configs.DatasetType.choices
classmethod
¶ fastvideo.configs.configs.DatasetType.from_string
classmethod
¶from_string(value: str) -> DatasetType
Convert string to DatasetType enum.
Source code in fastvideo/configs/configs.py
fastvideo.configs.configs.PreprocessConfig
dataclass
¶
PreprocessConfig(model_path: str = '', dataset_path: str = '', dataset_type: DatasetType = HF, dataset_output_dir: str = './output', dataloader_num_workers: int = 1, preprocess_video_batch_size: int = 2, samples_per_file: int = 64, flush_frequency: int = 256, video_loader_type: VideoLoaderType = TORCHCODEC, max_height: int = 480, max_width: int = 848, num_frames: int = 163, video_length_tolerance_range: float = 2.0, train_fps: int = 30, speed_factor: float = 1.0, drop_short_ratio: float = 1.0, do_temporal_sample: bool = False, training_cfg_rate: float = 0.0, with_audio: bool = False, seed: int = 42)
Configuration for preprocessing operations.
Functions¶
fastvideo.configs.configs.PreprocessConfig.add_cli_args
staticmethod
¶add_cli_args(parser: FlexibleArgumentParser, prefix: str = 'preprocess') -> FlexibleArgumentParser
Add preprocessing configuration arguments to the parser.
Source code in fastvideo/configs/configs.py
(Source listing spans lines 92–192 of fastvideo/configs/configs.py.)
fastvideo.configs.configs.PreprocessConfig.from_kwargs
classmethod
¶from_kwargs(kwargs: dict[str, Any]) -> Optional[PreprocessConfig]
Create PreprocessConfig from keyword arguments.
Source code in fastvideo/configs/configs.py
fastvideo.configs.configs.VideoLoaderType
¶
Enumeration for different video loaders.
Functions¶
fastvideo.configs.configs.VideoLoaderType.choices
classmethod
¶ fastvideo.configs.configs.VideoLoaderType.from_string
classmethod
¶from_string(value: str) -> VideoLoaderType
Convert string to VideoLoaderType enum.
Source code in fastvideo/configs/configs.py
Functions¶
fastvideo.configs.models
¶
Classes¶
fastvideo.configs.models.DiTConfig
dataclass
¶
DiTConfig(arch_config: DiTArchConfig = DiTArchConfig(), prefix: str = '', quant_config: QuantizationConfig | None = None)
Bases: ModelConfig
Functions¶
fastvideo.configs.models.DiTConfig.add_cli_args
staticmethod
¶Add CLI arguments for DiTConfig fields
Source code in fastvideo/configs/models/dits/base.py
fastvideo.configs.models.VAEConfig
dataclass
¶
VAEConfig(arch_config: VAEArchConfig = VAEArchConfig(), load_encoder: bool = True, load_decoder: bool = True, tile_sample_min_height: int = 256, tile_sample_min_width: int = 256, tile_sample_min_num_frames: int = 16, tile_sample_stride_height: int = 192, tile_sample_stride_width: int = 192, tile_sample_stride_num_frames: int = 12, blend_num_frames: int = 0, use_tiling: bool = True, use_temporal_tiling: bool = True, use_parallel_tiling: bool = True)
Bases: ModelConfig
Functions¶
fastvideo.configs.models.VAEConfig.add_cli_args
staticmethod
¶Add CLI arguments for VAEConfig fields
Source code in fastvideo/configs/models/vaes/base.py
Modules¶
fastvideo.configs.models.audio
¶
fastvideo.configs.models.dits
¶
Classes¶
fastvideo.configs.models.dits.Cosmos25VideoConfig
dataclass
¶Cosmos25VideoConfig(arch_config: DiTArchConfig = Cosmos25ArchConfig(), prefix: str = 'Cosmos25', quant_config: QuantizationConfig | None = None)
fastvideo.configs.models.dits.HunyuanGameCraftConfig
dataclass
¶HunyuanGameCraftConfig(arch_config: DiTArchConfig = HunyuanGameCraftArchConfig(), prefix: str = 'HunyuanGameCraft', quant_config: QuantizationConfig | None = None)
fastvideo.configs.models.dits.LTX2VideoConfig
dataclass
¶LTX2VideoConfig(arch_config: DiTArchConfig = LTX2VideoArchConfig(), prefix: str = 'ltx2', quant_config: QuantizationConfig | None = None)
fastvideo.configs.models.dits.LongCatVideoConfig
dataclass
¶LongCatVideoConfig(arch_config: DiTArchConfig = LongCatVideoArchConfig(), prefix: str = 'longcat', quant_config: QuantizationConfig | None = None)
Modules¶
fastvideo.configs.models.dits.base
¶ fastvideo.configs.models.dits.base.DiTConfig
dataclass
¶DiTConfig(arch_config: DiTArchConfig = DiTArchConfig(), prefix: str = '', quant_config: QuantizationConfig | None = None)
Bases: ModelConfig
fastvideo.configs.models.dits.base.DiTConfig.add_cli_args
staticmethod
¶Add CLI arguments for DiTConfig fields
Source code in fastvideo/configs/models/dits/base.py
fastvideo.configs.models.dits.cosmos2_5
¶ fastvideo.configs.models.dits.cosmos2_5.Cosmos25ArchConfig
dataclass
¶Cosmos25ArchConfig(stacked_params_mapping: list[tuple[str, str, str]] = list(), _fsdp_shard_conditions: list = (lambda: [is_transformer_blocks])(), _compile_conditions: list = list(), param_names_mapping: dict = (lambda: {'^net\\.x_embedder\\.proj\\.1\\.(.*)$': 'patch_embed.proj.\\1', '^net\\.t_embedder\\.1\\.linear_1\\.(.*)$': 'time_embed.t_embedder.linear_1.\\1', '^net\\.t_embedder\\.1\\.linear_2\\.(.*)$': 'time_embed.t_embedder.linear_2.\\1', '^net\\.t_embedding_norm\\.(.*)$': 'time_embed.norm.\\1', '^net\\.crossattn_proj\\.0\\.weight$': 'crossattn_proj.0.weight', '^net\\.crossattn_proj\\.0\\.bias$': 'crossattn_proj.0.bias', '^net\\.blocks\\.(\\d+)\\.self_attn\\.q_proj\\.(.*)$': 'transformer_blocks.\\1.attn1.to_q.\\2', '^net\\.blocks\\.(\\d+)\\.self_attn\\.k_proj\\.(.*)$': 'transformer_blocks.\\1.attn1.to_k.\\2', '^net\\.blocks\\.(\\d+)\\.self_attn\\.v_proj\\.(.*)$': 'transformer_blocks.\\1.attn1.to_v.\\2', '^net\\.blocks\\.(\\d+)\\.self_attn\\.output_proj\\.(.*)$': 'transformer_blocks.\\1.attn1.to_out.\\2', '^net\\.blocks\\.(\\d+)\\.self_attn\\.q_norm\\.weight$': 'transformer_blocks.\\1.attn1.norm_q.weight', '^net\\.blocks\\.(\\d+)\\.self_attn\\.k_norm\\.weight$': 'transformer_blocks.\\1.attn1.norm_k.weight', '^net\\.blocks\\.(\\d+)\\.self_attn\\.q_norm\\._extra_state$': 'transformer_blocks.\\1.attn1.norm_q._extra_state', '^net\\.blocks\\.(\\d+)\\.self_attn\\.k_norm\\._extra_state$': 'transformer_blocks.\\1.attn1.norm_k._extra_state', '^net\\.blocks\\.(\\d+)\\.cross_attn\\.q_proj\\.(.*)$': 'transformer_blocks.\\1.attn2.to_q.\\2', '^net\\.blocks\\.(\\d+)\\.cross_attn\\.k_proj\\.(.*)$': 'transformer_blocks.\\1.attn2.to_k.\\2', '^net\\.blocks\\.(\\d+)\\.cross_attn\\.v_proj\\.(.*)$': 'transformer_blocks.\\1.attn2.to_v.\\2', '^net\\.blocks\\.(\\d+)\\.cross_attn\\.output_proj\\.(.*)$': 'transformer_blocks.\\1.attn2.to_out.\\2', '^net\\.blocks\\.(\\d+)\\.cross_attn\\.q_norm\\.weight$': 'transformer_blocks.\\1.attn2.norm_q.weight', 
'^net\\.blocks\\.(\\d+)\\.cross_attn\\.k_norm\\.weight$': 'transformer_blocks.\\1.attn2.norm_k.weight', '^net\\.blocks\\.(\\d+)\\.cross_attn\\.q_norm\\._extra_state$': 'transformer_blocks.\\1.attn2.norm_q._extra_state', '^net\\.blocks\\.(\\d+)\\.cross_attn\\.k_norm\\._extra_state$': 'transformer_blocks.\\1.attn2.norm_k._extra_state', '^net\\.blocks\\.(\\d+)\\.mlp\\.layer1\\.(.*)$': 'transformer_blocks.\\1.mlp.fc_in.\\2', '^net\\.blocks\\.(\\d+)\\.mlp\\.layer2\\.(.*)$': 'transformer_blocks.\\1.mlp.fc_out.\\2', '^net\\.blocks\\.(\\d+)\\.adaln_modulation_self_attn\\.1\\.(.*)$': 'transformer_blocks.\\1.adaln_modulation_self_attn.1.\\2', '^net\\.blocks\\.(\\d+)\\.adaln_modulation_self_attn\\.2\\.(.*)$': 'transformer_blocks.\\1.adaln_modulation_self_attn.2.\\2', '^net\\.blocks\\.(\\d+)\\.adaln_modulation_cross_attn\\.1\\.(.*)$': 'transformer_blocks.\\1.adaln_modulation_cross_attn.1.\\2', '^net\\.blocks\\.(\\d+)\\.adaln_modulation_cross_attn\\.2\\.(.*)$': 'transformer_blocks.\\1.adaln_modulation_cross_attn.2.\\2', '^net\\.blocks\\.(\\d+)\\.adaln_modulation_mlp\\.1\\.(.*)$': 'transformer_blocks.\\1.adaln_modulation_mlp.1.\\2', '^net\\.blocks\\.(\\d+)\\.adaln_modulation_mlp\\.2\\.(.*)$': 'transformer_blocks.\\1.adaln_modulation_mlp.2.\\2', '^net\\.blocks\\.(\\d+)\\.layer_norm_self_attn\\._extra_state$': 'transformer_blocks.\\1.norm1.norm._extra_state', '^net\\.blocks\\.(\\d+)\\.layer_norm_cross_attn\\._extra_state$': 'transformer_blocks.\\1.norm2.norm._extra_state', '^net\\.blocks\\.(\\d+)\\.layer_norm_mlp\\._extra_state$': 'transformer_blocks.\\1.norm3.norm._extra_state', '^net\\.final_layer\\.linear\\.(.*)$': 'final_layer.proj_out.\\1', '^net\\.final_layer\\.adaln_modulation\\.1\\.(.*)$': 'final_layer.linear_1.\\1', '^net\\.final_layer\\.adaln_modulation\\.2\\.(.*)$': 'final_layer.linear_2.\\1'})(), reverse_param_names_mapping: dict = dict(), lora_param_names_mapping: dict = (lambda: {'^transformer_blocks\\.(\\d+)\\.attn1\\.to_q\\.(.*)$': 
'transformer_blocks.\\1.attn1.to_q.\\2', '^transformer_blocks\\.(\\d+)\\.attn1\\.to_k\\.(.*)$': 'transformer_blocks.\\1.attn1.to_k.\\2', '^transformer_blocks\\.(\\d+)\\.attn1\\.to_v\\.(.*)$': 'transformer_blocks.\\1.attn1.to_v.\\2', '^transformer_blocks\\.(\\d+)\\.attn1\\.to_out\\.(.*)$': 'transformer_blocks.\\1.attn1.to_out.\\2', '^transformer_blocks\\.(\\d+)\\.attn2\\.to_q\\.(.*)$': 'transformer_blocks.\\1.attn2.to_q.\\2', '^transformer_blocks\\.(\\d+)\\.attn2\\.to_k\\.(.*)$': 'transformer_blocks.\\1.attn2.to_k.\\2', '^transformer_blocks\\.(\\d+)\\.attn2\\.to_v\\.(.*)$': 'transformer_blocks.\\1.attn2.to_v.\\2', '^transformer_blocks\\.(\\d+)\\.attn2\\.to_out\\.(.*)$': 'transformer_blocks.\\1.attn2.to_out.\\2', '^transformer_blocks\\.(\\d+)\\.mlp\\.(.*)$': 'transformer_blocks.\\1.mlp.\\2'})(), _supported_attention_backends: tuple[AttentionBackendEnum, ...] = (SAGE_ATTN, FLASH_ATTN, TORCH_SDPA, VIDEO_SPARSE_ATTN, VMOBA_ATTN, SAGE_ATTN_THREE, SLA_ATTN, SAGE_SLA_ATTN), hidden_size: int = 0, num_attention_heads: int = 16, num_channels_latents: int = 0, in_channels: int = 16, out_channels: int = 16, exclude_lora_layers: list[str] = (lambda: ['embedder'])(), boundary_ratio: float | None = None, attention_head_dim: int = 128, num_layers: int = 28, mlp_ratio: float = 4.0, text_embed_dim: int = 1024, adaln_lora_dim: int = 256, use_adaln_lora: bool = True, max_size: tuple[int, int, int] = (128, 240, 240), patch_size: tuple[int, int, int] = (1, 2, 2), rope_scale: tuple[float, float, float] = (1.0, 3.0, 3.0), concat_padding_mask: bool = True, extra_pos_embed_type: str | None = None, use_crossattn_projection: bool = False, crossattn_proj_in_channels: int = 100352, rope_enable_fps_modulation: bool = True, qk_norm: str = 'rms_norm', eps: float = 1e-06)
Bases: DiTArchConfig
Configuration for Cosmos 2.5 architecture (MiniTrainDIT).
fastvideo.configs.models.dits.cosmos2_5.Cosmos25VideoConfig
dataclass
¶Cosmos25VideoConfig(arch_config: DiTArchConfig = Cosmos25ArchConfig(), prefix: str = 'Cosmos25', quant_config: QuantizationConfig | None = None)
fastvideo.configs.models.dits.gen3c
¶ fastvideo.configs.models.dits.gen3c.Gen3CArchConfig
dataclass
¶Gen3CArchConfig(stacked_params_mapping: list[tuple[str, str, str]] = list(), _fsdp_shard_conditions: list = (lambda: [is_transformer_blocks])(), _compile_conditions: list = list(), param_names_mapping: dict = (lambda: {'^net\\.x_embedder\\.proj\\.1\\.(.*)$': 'patch_embed.proj.\\1', '^net\\.t_embedder\\.0\\.(.*)$': 'time_embed.time_proj.\\1', '^net\\.t_embedder\\.1\\.linear_1\\.(.*)$': 'time_embed.t_embedder.linear_1.\\1', '^net\\.t_embedder\\.1\\.linear_2\\.(.*)$': 'time_embed.t_embedder.linear_2.\\1', '^net\\.augment_sigma_embedder\\.0\\.(.*)$': 'augment_sigma_embed.time_proj.\\1', '^net\\.augment_sigma_embedder\\.1\\.linear_1\\.(.*)$': 'augment_sigma_embed.t_embedder.linear_1.\\1', '^net\\.augment_sigma_embedder\\.1\\.linear_2\\.(.*)$': 'augment_sigma_embed.t_embedder.linear_2.\\1', '^net\\.affline_norm\\.(.*)$': 'affine_norm.\\1', '^net\\.extra_pos_embedder\\.pos_emb_t$': 'learnable_pos_embed.pos_emb_t', '^net\\.extra_pos_embedder\\.pos_emb_h$': 'learnable_pos_embed.pos_emb_h', '^net\\.extra_pos_embedder\\.pos_emb_w$': 'learnable_pos_embed.pos_emb_w', '^net\\.blocks\\.block(\\d+)\\.blocks\\.0\\.block\\.attn\\.to_q\\.0\\.(.*)$': 'transformer_blocks.\\1.attn1.to_q.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.0\\.block\\.attn\\.to_q\\.1\\.(.*)$': 'transformer_blocks.\\1.attn1.norm_q.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.0\\.block\\.attn\\.to_k\\.0\\.(.*)$': 'transformer_blocks.\\1.attn1.to_k.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.0\\.block\\.attn\\.to_k\\.1\\.(.*)$': 'transformer_blocks.\\1.attn1.norm_k.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.0\\.block\\.attn\\.to_v\\.0\\.(.*)$': 'transformer_blocks.\\1.attn1.to_v.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.0\\.block\\.attn\\.to_out\\.0\\.(.*)$': 'transformer_blocks.\\1.attn1.to_out.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.0\\.adaLN_modulation\\.(.*)$': 'transformer_blocks.\\1.adaln_modulation_self_attn.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.1\\.block\\.attn\\.to_q\\.0\\.(.*)$': 
'transformer_blocks.\\1.attn2.to_q.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.1\\.block\\.attn\\.to_q\\.1\\.(.*)$': 'transformer_blocks.\\1.attn2.norm_q.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.1\\.block\\.attn\\.to_k\\.0\\.(.*)$': 'transformer_blocks.\\1.attn2.to_k.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.1\\.block\\.attn\\.to_k\\.1\\.(.*)$': 'transformer_blocks.\\1.attn2.norm_k.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.1\\.block\\.attn\\.to_v\\.0\\.(.*)$': 'transformer_blocks.\\1.attn2.to_v.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.1\\.block\\.attn\\.to_out\\.0\\.(.*)$': 'transformer_blocks.\\1.attn2.to_out.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.1\\.adaLN_modulation\\.(.*)$': 'transformer_blocks.\\1.adaln_modulation_cross_attn.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.2\\.block\\.layer1\\.(.*)$': 'transformer_blocks.\\1.mlp.fc_in.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.2\\.block\\.layer2\\.(.*)$': 'transformer_blocks.\\1.mlp.fc_out.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.2\\.adaLN_modulation\\.(.*)$': 'transformer_blocks.\\1.adaln_modulation_mlp.\\2', '^net\\.final_layer\\.linear\\.(.*)$': 'final_layer.proj_out.\\1', '^net\\.final_layer\\.adaLN_modulation\\.(.*)$': 'final_layer.adaln_modulation.\\1'})(), reverse_param_names_mapping: dict = dict(), lora_param_names_mapping: dict = (lambda: {'^transformer_blocks\\.(\\d+)\\.attn1\\.to_q\\.(.*)$': 'transformer_blocks.\\1.attn1.to_q.\\2', '^transformer_blocks\\.(\\d+)\\.attn1\\.to_k\\.(.*)$': 'transformer_blocks.\\1.attn1.to_k.\\2', '^transformer_blocks\\.(\\d+)\\.attn1\\.to_v\\.(.*)$': 'transformer_blocks.\\1.attn1.to_v.\\2', '^transformer_blocks\\.(\\d+)\\.attn1\\.to_out\\.(.*)$': 'transformer_blocks.\\1.attn1.to_out.\\2', '^transformer_blocks\\.(\\d+)\\.attn2\\.to_q\\.(.*)$': 'transformer_blocks.\\1.attn2.to_q.\\2', '^transformer_blocks\\.(\\d+)\\.attn2\\.to_k\\.(.*)$': 'transformer_blocks.\\1.attn2.to_k.\\2', '^transformer_blocks\\.(\\d+)\\.attn2\\.to_v\\.(.*)$': 
'transformer_blocks.\\1.attn2.to_v.\\2', '^transformer_blocks\\.(\\d+)\\.attn2\\.to_out\\.(.*)$': 'transformer_blocks.\\1.attn2.to_out.\\2', '^transformer_blocks\\.(\\d+)\\.mlp\\.(.*)$': 'transformer_blocks.\\1.mlp.\\2'})(), _supported_attention_backends: tuple[AttentionBackendEnum, ...] = (SAGE_ATTN, FLASH_ATTN, TORCH_SDPA, VIDEO_SPARSE_ATTN, VMOBA_ATTN, SAGE_ATTN_THREE, SLA_ATTN, SAGE_SLA_ATTN), hidden_size: int = 0, num_attention_heads: int = 32, num_channels_latents: int = 0, in_channels: int = 16, out_channels: int = 16, exclude_lora_layers: list[str] = (lambda: ['embedder'])(), boundary_ratio: float | None = None, CHANNELS_PER_BUFFER: int = 32, frame_buffer_max: int = 2, attention_head_dim: int = 128, num_layers: int = 28, mlp_ratio: float = 4.0, text_embed_dim: int = 1024, adaln_lora_dim: int = 256, use_adaln_lora: bool = True, add_augment_sigma_embedding: bool = False, max_size: tuple[int, int, int] = (128, 240, 240), patch_size: tuple[int, int, int] = (1, 2, 2), rope_scale: tuple[float, float, float] = (2.0, 1.0, 1.0), extra_pos_embed_type: str = 'learnable', concat_padding_mask: bool = True, use_crossattn_projection: bool = False, rope_enable_fps_modulation: bool = True, qk_norm: str = 'rms_norm', eps: float = 1e-06, affine_emb_norm: bool = True, block_x_format: str = 'THWBD')
Bases: DiTArchConfig
Configuration for GEN3C architecture (VideoExtendGeneralDIT).
fastvideo.configs.models.dits.gen3c.Gen3CVideoConfig
dataclass
¶Gen3CVideoConfig(arch_config: DiTArchConfig = Gen3CArchConfig(), prefix: str = 'Gen3C', quant_config: QuantizationConfig | None = None)
fastvideo.configs.models.dits.hunyuangamecraft
¶Configuration for HunyuanGameCraft transformer model.
HunyuanGameCraft extends HunyuanVideo with: 1. CameraNet for camera/action conditioning 2. 33 input channels (16 latent + 16 gt_latent + 1 mask) 3. Mask-based conditioning for autoregressive generation
fastvideo.configs.models.dits.hunyuangamecraft.HunyuanGameCraftArchConfig
dataclass
¶HunyuanGameCraftArchConfig(stacked_params_mapping: list[tuple[str, str, str]] = list(), _fsdp_shard_conditions: list = (lambda: [is_double_block, is_single_block, is_refiner_block, is_camera_net])(), _compile_conditions: list = (lambda: [is_double_block, is_single_block, is_txt_in])(), param_names_mapping: dict = (lambda: {'^(.*)\\.img_mlp\\.fc1\\.(.*)$': '\\1.img_mlp.fc_in.\\2', '^(.*)\\.img_mlp\\.fc2\\.(.*)$': '\\1.img_mlp.fc_out.\\2', '^(.*)\\.txt_mlp\\.fc1\\.(.*)$': '\\1.txt_mlp.fc_in.\\2', '^(.*)\\.txt_mlp\\.fc2\\.(.*)$': '\\1.txt_mlp.fc_out.\\2', '^single_blocks\\.(\\d+)\\.mlp\\.fc1\\.(.*)$': 'single_blocks.\\1.mlp.fc_in.\\2', '^single_blocks\\.(\\d+)\\.mlp\\.fc2\\.(.*)$': 'single_blocks.\\1.mlp.fc_out.\\2', '^txt_in\\.individual_token_refiner\\.blocks\\.(\\d+)\\.(.*)$': 'txt_in.refiner_blocks.\\1.\\2', '^vector_in\\.in_layer\\.(.*)$': 'vector_in.fc_in.\\1', '^vector_in\\.out_layer\\.(.*)$': 'vector_in.fc_out.\\1', '^time_in\\.mlp\\.0\\.(.*)$': 'time_in.mlp.fc_in.\\1', '^time_in\\.mlp\\.2\\.(.*)$': 'time_in.mlp.fc_out.\\1', '^guidance_in\\.mlp\\.0\\.(.*)$': 'guidance_in.mlp.fc_in.\\1', '^guidance_in\\.mlp\\.2\\.(.*)$': 'guidance_in.mlp.fc_out.\\1', '^final_layer\\.adaLN_modulation\\.1\\.(.*)$': 'final_layer.adaLN_modulation.linear.\\1', '^txt_in\\.refiner_blocks\\.(\\d+)\\.mlp\\.fc1\\.(.*)$': 'txt_in.refiner_blocks.\\1.mlp.fc_in.\\2', '^txt_in\\.refiner_blocks\\.(\\d+)\\.mlp\\.fc2\\.(.*)$': 'txt_in.refiner_blocks.\\1.mlp.fc_out.\\2'})(), reverse_param_names_mapping: dict = (lambda: {})(), lora_param_names_mapping: dict = dict(), _supported_attention_backends: tuple[AttentionBackendEnum, ...] 
= (SAGE_ATTN, FLASH_ATTN, TORCH_SDPA, VIDEO_SPARSE_ATTN, VMOBA_ATTN, SAGE_ATTN_THREE, SLA_ATTN, SAGE_SLA_ATTN), hidden_size: int = 0, num_attention_heads: int = 24, num_channels_latents: int = 0, in_channels: int = 33, out_channels: int = 16, exclude_lora_layers: list[str] = (lambda: ['img_in', 'txt_in', 'time_in', 'vector_in', 'camera_net'])(), boundary_ratio: float | None = None, _fastvideo_version: str = '0.1.0', camera_net: bool = True, patch_size: int | tuple[int, int, int] = 2, patch_size_t: int = 1, attention_head_dim: int = 128, mlp_ratio: float = 4.0, num_layers: int = 20, num_single_layers: int = 40, num_refiner_layers: int = 2, rope_axes_dim: tuple[int, int, int] = (16, 56, 56), guidance_embeds: bool = False, dtype: dtype | None = None, text_embed_dim: int = 4096, pooled_projection_dim: int = 768, rope_theta: int = 256, qk_norm: str = 'rms_norm', camera_in_channels: int = 6, camera_downscale_coef: int = 8, camera_out_channels: int = 16)
Bases: DiTArchConfig
Architecture config for HunyuanGameCraft transformer.
fastvideo.configs.models.dits.hunyuangamecraft.HunyuanGameCraftConfig
dataclass
¶HunyuanGameCraftConfig(arch_config: DiTArchConfig = HunyuanGameCraftArchConfig(), prefix: str = 'HunyuanGameCraft', quant_config: QuantizationConfig | None = None)
fastvideo.configs.models.dits.longcat
¶LongCat Video DiT configuration for native FastVideo implementation.
fastvideo.configs.models.dits.longcat.LongCatVideoArchConfig
dataclass
¶LongCatVideoArchConfig(stacked_params_mapping: list[tuple[str, str, str]] = list(), _fsdp_shard_conditions: list = (lambda: [is_longcat_blocks])(), _compile_conditions: list = (lambda: [is_longcat_blocks])(), param_names_mapping: dict = (lambda: {'^x_embedder\\.(.*)$': 'patch_embed.\\1', '^t_embedder\\.mlp\\.0\\.(.*)$': 'time_embedder.linear_1.\\1', '^t_embedder\\.mlp\\.2\\.(.*)$': 'time_embedder.linear_2.\\1', '^y_embedder\\.y_proj\\.0\\.(.*)$': 'caption_embedder.linear_1.\\1', '^y_embedder\\.y_proj\\.2\\.(.*)$': 'caption_embedder.linear_2.\\1', '^blocks\\.(\\d+)\\.adaLN_modulation\\.1\\.(.*)$': 'blocks.\\1.adaln_linear_1.\\2', '^blocks\\.(\\d+)\\.mod_norm_attn\\.(.*)$': 'blocks.\\1.norm_attn.\\2', '^blocks\\.(\\d+)\\.mod_norm_ffn\\.(.*)$': 'blocks.\\1.norm_ffn.\\2', '^blocks\\.(\\d+)\\.pre_crs_attn_norm\\.(.*)$': 'blocks.\\1.norm_cross.\\2', '^blocks\\.(\\d+)\\.attn\\.qkv\\.(.*)$': 'blocks.\\1.self_attn.qkv_fused.\\2', '^blocks\\.(\\d+)\\.attn\\.proj\\.(.*)$': 'blocks.\\1.self_attn.to_out.\\2', '^blocks\\.(\\d+)\\.attn\\.q_norm\\.(.*)$': 'blocks.\\1.self_attn.q_norm.\\2', '^blocks\\.(\\d+)\\.attn\\.k_norm\\.(.*)$': 'blocks.\\1.self_attn.k_norm.\\2', '^blocks\\.(\\d+)\\.cross_attn\\.q_linear\\.(.*)$': 'blocks.\\1.cross_attn.to_q.\\2', '^blocks\\.(\\d+)\\.cross_attn\\.kv_linear\\.(.*)$': 'blocks.\\1.cross_attn.kv_fused.\\2', '^blocks\\.(\\d+)\\.cross_attn\\.proj\\.(.*)$': 'blocks.\\1.cross_attn.to_out.\\2', '^blocks\\.(\\d+)\\.cross_attn\\.q_norm\\.(.*)$': 'blocks.\\1.cross_attn.q_norm.\\2', '^blocks\\.(\\d+)\\.cross_attn\\.k_norm\\.(.*)$': 'blocks.\\1.cross_attn.k_norm.\\2', '^blocks\\.(\\d+)\\.ffn\\.w1\\.(.*)$': 'blocks.\\1.ffn.w1.\\2', '^blocks\\.(\\d+)\\.ffn\\.w2\\.(.*)$': 'blocks.\\1.ffn.w2.\\2', '^blocks\\.(\\d+)\\.ffn\\.w3\\.(.*)$': 'blocks.\\1.ffn.w3.\\2', '^final_layer\\.adaLN_modulation\\.1\\.(.*)$': 'final_layer.adaln_linear.\\1', '^final_layer\\.norm_final\\.(.*)$': 'final_layer.norm.\\1', '^final_layer\\.linear\\.(.*)$': 'final_layer.proj.\\1'})(), 
reverse_param_names_mapping: dict = (lambda: {})(), lora_param_names_mapping: dict = (lambda: {})(), _supported_attention_backends: tuple = (lambda: (FLASH_ATTN, TORCH_SDPA))(), hidden_size: int = 4096, num_attention_heads: int = 32, num_channels_latents: int = 16, in_channels: int = 16, out_channels: int = 16, exclude_lora_layers: list[str] = (lambda: [])(), boundary_ratio: float | None = None, depth: int = 48, attention_head_dim: int = 128, patch_size: tuple[int, int, int] = (1, 2, 2), caption_channels: int = 4096, adaln_tembed_dim: int = 512, frequency_embedding_size: int = 256, mlp_ratio: int = 4, text_tokens_zero_pad: bool = True, enable_bsa: bool = False, bsa_params: dict | None = (lambda: {'sparsity': 0.9375, 'cdf_threshold': None, 'chunk_3d_shape_q': [4, 4, 4], 'chunk_3d_shape_k': [4, 4, 4]})())
Bases: DiTArchConfig
Architecture configuration for native LongCat Video DiT.
fastvideo.configs.models.dits.longcat.LongCatVideoConfig
dataclass
¶LongCatVideoConfig(arch_config: DiTArchConfig = LongCatVideoArchConfig(), prefix: str = 'longcat', quant_config: QuantizationConfig | None = None)
fastvideo.configs.models.dits.longcat.is_longcat_blocks
¶ fastvideo.configs.models.dits.ltx2
¶LTX-2 Transformer configuration for native FastVideo integration.
fastvideo.configs.models.dits.ltx2.LTX2VideoArchConfig
dataclass
¶LTX2VideoArchConfig(stacked_params_mapping: list[tuple[str, str, str]] = list(), _fsdp_shard_conditions: list = (lambda: [is_ltx2_blocks])(), _compile_conditions: list = (lambda: [is_ltx2_blocks])(), param_names_mapping: dict = (lambda: {'^model\\.diffusion_model\\.(.*)$': 'model.\\1', '^diffusion_model\\.(.*)$': 'model.\\1', '^model\\.(.*)$': 'model.\\1', '^(.*)$': 'model.\\1'})(), reverse_param_names_mapping: dict = (lambda: {})(), lora_param_names_mapping: dict = (lambda: {})(), _supported_attention_backends: tuple[AttentionBackendEnum, ...] = (SAGE_ATTN, FLASH_ATTN, TORCH_SDPA, VIDEO_SPARSE_ATTN, VMOBA_ATTN, SAGE_ATTN_THREE, SLA_ATTN, SAGE_SLA_ATTN), hidden_size: int = 0, num_attention_heads: int = 32, num_channels_latents: int = 128, in_channels: int | None = None, out_channels: int | None = None, exclude_lora_layers: list[str] = list(), boundary_ratio: float | None = None, attention_head_dim: int = 128, num_layers: int = 48, cross_attention_dim: int = 4096, caption_channels: int = 3840, norm_eps: float = 1e-06, attention_type: str = 'default', rope_type: str = 'split', double_precision_rope: bool = True, positional_embedding_theta: float = 10000.0, positional_embedding_max_pos: list[int] = (lambda: [20, 2048, 2048])(), timestep_scale_multiplier: int = 1000, use_middle_indices_grid: bool = True, patch_size: tuple[int, int, int] = (1, 1, 1), audio_num_attention_heads: int = 32, audio_attention_head_dim: int = 64, audio_in_channels: int = 128, audio_out_channels: int = 128, audio_cross_attention_dim: int = 2048, audio_positional_embedding_max_pos: list[int] = (lambda: [20])(), av_ca_timestep_scale_multiplier: int = 1)
Bases: DiTArchConfig
Architecture configuration for LTX-2 video transformer.
fastvideo.configs.models.dits.ltx2.LTX2VideoConfig
dataclass
¶LTX2VideoConfig(arch_config: DiTArchConfig = LTX2VideoArchConfig(), prefix: str = 'ltx2', quant_config: QuantizationConfig | None = None)
fastvideo.configs.models.encoders
¶
Classes¶
fastvideo.configs.models.encoders.Reason1ArchConfig
dataclass
¶Reason1ArchConfig(stacked_params_mapping: list[tuple[str, str, str]] = list(), architectures: list[str] = (lambda: ['Qwen2_5_VLForConditionalGeneration'])(), _supported_attention_backends: tuple[AttentionBackendEnum, ...] = (FLASH_ATTN, TORCH_SDPA), output_hidden_states: bool = True, use_return_dict: bool = True, vocab_size: int = 152064, hidden_size: int = 3584, num_hidden_layers: int = 28, num_attention_heads: int = 28, pad_token_id: int = 151643, eos_token_id: int = 151645, text_len: int = 512, hidden_state_skip_layer: int = 0, decoder_start_token_id: int = 0, output_past: bool = True, scalable_attention: bool = True, tie_word_embeddings: bool = False, tokenizer_kwargs: dict[str, Any] = dict(), _fsdp_shard_conditions: list = (lambda: [])(), model_type: str = 'qwen2_5_vl', num_key_value_heads: int = 4, intermediate_size: int = 18944, bos_token_id: int = 151643, image_token_id: int = 151655, video_token_id: int = 151656, vision_token_id: int = 151654, vision_start_token_id: int = 151652, vision_end_token_id: int = 151653, vision_config: dict[str, Any] | None = None, rope_theta: float = 1000000.0, rope_scaling: dict[str, Any] | None = (lambda: {'type': 'mrope', 'mrope_section': [16, 24, 24]})(), max_position_embeddings: int = 128000, max_window_layers: int = 28, embedding_concat_strategy: str = 'mean_pooling', n_layers_per_group: int = 5, num_embedding_padding_tokens: int = 512, attention_dropout: float = 0.0, hidden_act: str = 'silu', initializer_range: float = 0.02, rms_norm_eps: float = 1e-06, use_sliding_window: bool = False, sliding_window: int = 32768, use_cache: bool = False, torch_dtype: str = 'bfloat16', _attn_implementation: str = 'flash_attention_2')
Bases: TextEncoderArchConfig
Architecture settings (defaults match Qwen2.5-VL-7B-Instruct).
fastvideo.configs.models.encoders.Reason1Config
dataclass
¶Reason1Config(arch_config: Reason1ArchConfig = Reason1ArchConfig(), prefix: str = '', quant_config: QuantizationConfig | None = None, lora_config: Any | None = None, is_chat_model: bool = False, tokenizer_type: str = 'Qwen/Qwen2.5-VL-7B-Instruct')
Bases: TextEncoderConfig
Reason1 text encoder config.
fastvideo.configs.models.encoders.SiglipVisionConfig
dataclass
¶SiglipVisionConfig(arch_config: ImageEncoderArchConfig = SiglipVisionArchConfig(), prefix: str = 'siglip', quant_config: QuantizationConfig | None = None, lora_config: Any | None = None, num_hidden_layers_override: int | None = None, require_post_norm: bool | None = None, enable_scale: bool = True, is_causal: bool = False)
Bases: ImageEncoderConfig
Configuration for SigLIP vision encoder.
fastvideo.configs.models.encoders.T5LargeConfig
dataclass
¶T5LargeConfig(arch_config: TextEncoderArchConfig = T5LargeArchConfig(), prefix: str = 't5', quant_config: QuantizationConfig | None = None, lora_config: Any | None = None, is_chat_model: bool = False)
Bases: TextEncoderConfig
T5 Large configuration for this specific model variant.
Modules¶
fastvideo.configs.models.encoders.reason1
¶Config for Reason1 (Qwen2.5-VL) text encoder.
fastvideo.configs.models.encoders.reason1.Reason1ArchConfig
dataclass
¶Reason1ArchConfig(stacked_params_mapping: list[tuple[str, str, str]] = list(), architectures: list[str] = (lambda: ['Qwen2_5_VLForConditionalGeneration'])(), _supported_attention_backends: tuple[AttentionBackendEnum, ...] = (FLASH_ATTN, TORCH_SDPA), output_hidden_states: bool = True, use_return_dict: bool = True, vocab_size: int = 152064, hidden_size: int = 3584, num_hidden_layers: int = 28, num_attention_heads: int = 28, pad_token_id: int = 151643, eos_token_id: int = 151645, text_len: int = 512, hidden_state_skip_layer: int = 0, decoder_start_token_id: int = 0, output_past: bool = True, scalable_attention: bool = True, tie_word_embeddings: bool = False, tokenizer_kwargs: dict[str, Any] = dict(), _fsdp_shard_conditions: list = (lambda: [])(), model_type: str = 'qwen2_5_vl', num_key_value_heads: int = 4, intermediate_size: int = 18944, bos_token_id: int = 151643, image_token_id: int = 151655, video_token_id: int = 151656, vision_token_id: int = 151654, vision_start_token_id: int = 151652, vision_end_token_id: int = 151653, vision_config: dict[str, Any] | None = None, rope_theta: float = 1000000.0, rope_scaling: dict[str, Any] | None = (lambda: {'type': 'mrope', 'mrope_section': [16, 24, 24]})(), max_position_embeddings: int = 128000, max_window_layers: int = 28, embedding_concat_strategy: str = 'mean_pooling', n_layers_per_group: int = 5, num_embedding_padding_tokens: int = 512, attention_dropout: float = 0.0, hidden_act: str = 'silu', initializer_range: float = 0.02, rms_norm_eps: float = 1e-06, use_sliding_window: bool = False, sliding_window: int = 32768, use_cache: bool = False, torch_dtype: str = 'bfloat16', _attn_implementation: str = 'flash_attention_2')
Bases: TextEncoderArchConfig
Architecture settings (defaults match Qwen2.5-VL-7B-Instruct).
fastvideo.configs.models.encoders.reason1.Reason1Config
dataclass
¶Reason1Config(arch_config: Reason1ArchConfig = Reason1ArchConfig(), prefix: str = '', quant_config: QuantizationConfig | None = None, lora_config: Any | None = None, is_chat_model: bool = False, tokenizer_type: str = 'Qwen/Qwen2.5-VL-7B-Instruct')
Bases: TextEncoderConfig
Reason1 text encoder config.
fastvideo.configs.models.encoders.siglip
¶SigLIP vision encoder configuration for FastVideo.
fastvideo.configs.models.encoders.siglip.SiglipVisionArchConfig
dataclass
¶SiglipVisionArchConfig(stacked_params_mapping: list = (lambda: [('qkv_proj', 'q_proj', 'q'), ('qkv_proj', 'k_proj', 'k'), ('qkv_proj', 'v_proj', 'v')])(), architectures: list[str] = (lambda: ['SiglipVisionModel'])(), _supported_attention_backends: tuple[AttentionBackendEnum, ...] = (FLASH_ATTN, TORCH_SDPA), output_hidden_states: bool = False, use_return_dict: bool = True, attention_dropout: float = 0.0, dtype: str | None = None, hidden_act: str = 'gelu_pytorch_tanh', hidden_size: int = 1152, image_size: int = 384, intermediate_size: int = 4304, layer_norm_eps: float = 1e-06, model_type: str = 'siglip_vision_model', num_attention_heads: int = 16, num_channels: int = 3, num_hidden_layers: int = 27, patch_size: int = 14)
Bases: ImageEncoderArchConfig
Architecture configuration for SigLIP vision encoder.
Fields match the config.json from HuggingFace SigLIP checkpoints.
fastvideo.configs.models.encoders.siglip.SiglipVisionConfig
dataclass
¶SiglipVisionConfig(arch_config: ImageEncoderArchConfig = SiglipVisionArchConfig(), prefix: str = 'siglip', quant_config: QuantizationConfig | None = None, lora_config: Any | None = None, num_hidden_layers_override: int | None = None, require_post_norm: bool | None = None, enable_scale: bool = True, is_causal: bool = False)
Bases: ImageEncoderConfig
Configuration for SigLIP vision encoder.
fastvideo.configs.models.encoders.t5
¶ fastvideo.configs.models.encoders.t5.T5LargeArchConfig
dataclass
¶T5LargeArchConfig(stacked_params_mapping: list[tuple[str, str, str]] = (lambda: [('.qkv_proj', '.q', 'q'), ('.qkv_proj', '.k', 'k'), ('.qkv_proj', '.v', 'v')])(), architectures: list[str] = (lambda: [])(), _supported_attention_backends: tuple[AttentionBackendEnum, ...] = (FLASH_ATTN, TORCH_SDPA), output_hidden_states: bool = False, use_return_dict: bool = True, vocab_size: int = 32128, hidden_size: int = 0, num_hidden_layers: int = 0, num_attention_heads: int = 0, pad_token_id: int = 0, eos_token_id: int = 1, text_len: int = 512, hidden_state_skip_layer: int = 0, decoder_start_token_id: int = 0, output_past: bool = True, scalable_attention: bool = True, tie_word_embeddings: bool = False, tokenizer_kwargs: dict[str, Any] = dict(), _fsdp_shard_conditions: list = (lambda: [_is_transformer_layer, _is_embeddings, _is_final_layernorm])(), d_model: int = 1024, d_kv: int = 128, d_ff: int = 65536, num_layers: int = 24, num_decoder_layers: int | None = 24, num_heads: int = 128, relative_attention_num_buckets: int = 32, relative_attention_max_distance: int = 128, dropout_rate: float = 0.1, layer_norm_epsilon: float = 1e-06, initializer_factor: float = 1.0, feed_forward_proj: str = 'relu', dense_act_fn: str = '', is_gated_act: bool = False, is_encoder_decoder: bool = True, use_cache: bool = True, classifier_dropout: float = 0.0, dtype: str | None = None, gradient_checkpointing: bool = False, n_positions: int = 512, task_specific_params: dict | None = None)
Bases: T5ArchConfig
T5 Large architecture config with model-specific parameter overrides.
fastvideo.configs.models.encoders.t5.T5LargeConfig
dataclass
¶T5LargeConfig(arch_config: TextEncoderArchConfig = T5LargeArchConfig(), prefix: str = 't5', quant_config: QuantizationConfig | None = None, lora_config: Any | None = None, is_chat_model: bool = False)
Bases: TextEncoderConfig
T5 Large configuration with model-specific defaults.
fastvideo.configs.models.vaes
¶
Classes¶
fastvideo.configs.models.vaes.Cosmos25VAEConfig
dataclass
¶Cosmos25VAEConfig(arch_config: Cosmos25VAEArchConfig = Cosmos25VAEArchConfig(), load_encoder: bool = True, load_decoder: bool = True, tile_sample_min_height: int = 256, tile_sample_min_width: int = 256, tile_sample_min_num_frames: int = 16, tile_sample_stride_height: int = 192, tile_sample_stride_width: int = 192, tile_sample_stride_num_frames: int = 12, blend_num_frames: int = 0, use_tiling: bool = False, use_temporal_tiling: bool = False, use_parallel_tiling: bool = False, use_feature_cache: bool = True)
fastvideo.configs.models.vaes.GameCraftVAEConfig
dataclass
¶GameCraftVAEConfig(arch_config: VAEArchConfig = GameCraftVAEArchConfig(), load_encoder: bool = True, load_decoder: bool = True, tile_sample_min_height: int = 256, tile_sample_min_width: int = 256, tile_sample_min_num_frames: int = 16, tile_sample_stride_height: int = 192, tile_sample_stride_width: int = 192, tile_sample_stride_num_frames: int = 12, blend_num_frames: int = 0, use_tiling: bool = True, use_temporal_tiling: bool = True, use_parallel_tiling: bool = True)
fastvideo.configs.models.vaes.Gen3CVAEConfig
dataclass
¶Gen3CVAEConfig(arch_config: CosmosVAEArchConfig = CosmosVAEArchConfig(), load_encoder: bool = True, load_decoder: bool = True, tile_sample_min_height: int = 256, tile_sample_min_width: int = 256, tile_sample_min_num_frames: int = 16, tile_sample_stride_height: int = 192, tile_sample_stride_width: int = 192, tile_sample_stride_num_frames: int = 12, blend_num_frames: int = 0, use_tiling: bool = False, use_temporal_tiling: bool = False, use_parallel_tiling: bool = False, use_feature_cache: bool = True)
Bases: CosmosVAEConfig
GEN3C VAE config placeholder.
GEN3C uses tokenizer-backed VAE loading logic at runtime, but we keep a model-specific config class so pipeline/model configs stay model-scoped.
Modules¶
fastvideo.configs.models.vaes.base
¶ fastvideo.configs.models.vaes.base.VAEConfig
dataclass
¶VAEConfig(arch_config: VAEArchConfig = VAEArchConfig(), load_encoder: bool = True, load_decoder: bool = True, tile_sample_min_height: int = 256, tile_sample_min_width: int = 256, tile_sample_min_num_frames: int = 16, tile_sample_stride_height: int = 192, tile_sample_stride_width: int = 192, tile_sample_stride_num_frames: int = 12, blend_num_frames: int = 0, use_tiling: bool = True, use_temporal_tiling: bool = True, use_parallel_tiling: bool = True)
Bases: ModelConfig
fastvideo.configs.models.vaes.base.VAEConfig.add_cli_args
staticmethod
¶Add CLI arguments for VAEConfig fields.
Source code in fastvideo/configs/models/vaes/base.py
fastvideo.configs.models.vaes.cosmos2_5vae
¶Cosmos 2.5 (Wan2.1-style) VAE config and checkpoint-key mapping.
fastvideo.configs.models.vaes.cosmos2_5vae.Cosmos25VAEArchConfig
dataclass
¶Cosmos25VAEArchConfig(stacked_params_mapping: list[tuple[str, str, str]] = list(), scaling_factor: float | Tensor = 0, temporal_compression_ratio: int = 4, spatial_compression_ratio: int = 8, _name_or_path: str = '', base_dim: int = 96, decoder_base_dim: int | None = None, z_dim: int = 16, dim_mult: tuple[int, ...] = (1, 2, 4, 4), num_res_blocks: int = 2, attn_scales: tuple[float, ...] = (), temperal_downsample: tuple[bool, ...] = (False, True, True), dropout: float = 0.0, is_residual: bool = False, in_channels: int = 3, out_channels: int = 3, patch_size: int | None = None, scale_factor_temporal: int = 4, scale_factor_spatial: int = 8, clip_output: bool = True, latents_mean: tuple[float, ...] = (-0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508, 0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921), latents_std: tuple[float, ...] = (2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743, 3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.916), param_names_mapping: dict[str, str] = (lambda: {'^conv1\\.(.*)$': 'quant_conv.\\1', '^conv2\\.(.*)$': 'post_quant_conv.\\1', '^encoder\\.conv1\\.(.*)$': 'encoder.conv_in.\\1', '^decoder\\.conv1\\.(.*)$': 'decoder.conv_in.\\1', '^encoder\\.head\\.0\\.gamma$': 'encoder.norm_out.gamma', '^encoder\\.head\\.2\\.(.*)$': 'encoder.conv_out.\\1', '^decoder\\.head\\.0\\.gamma$': 'decoder.norm_out.gamma', '^decoder\\.head\\.2\\.(.*)$': 'decoder.conv_out.\\1'})())
Bases: VAEArchConfig
fastvideo.configs.models.vaes.cosmos2_5vae.Cosmos25VAEArchConfig.map_official_key
staticmethod
¶Map a single official checkpoint key into FastVideo key space.
Source code in fastvideo/configs/models/vaes/cosmos2_5vae.py
(lines 83–192)
fastvideo.configs.models.vaes.cosmos2_5vae.Cosmos25VAEConfig
dataclass
¶Cosmos25VAEConfig(arch_config: Cosmos25VAEArchConfig = Cosmos25VAEArchConfig(), load_encoder: bool = True, load_decoder: bool = True, tile_sample_min_height: int = 256, tile_sample_min_width: int = 256, tile_sample_min_num_frames: int = 16, tile_sample_stride_height: int = 192, tile_sample_stride_width: int = 192, tile_sample_stride_num_frames: int = 12, blend_num_frames: int = 0, use_tiling: bool = False, use_temporal_tiling: bool = False, use_parallel_tiling: bool = False, use_feature_cache: bool = True)
fastvideo.configs.models.vaes.gamecraftvae
¶GameCraft VAE config - matches official config.json from Hunyuan-GameCraft-1.0.
fastvideo.configs.models.vaes.gamecraftvae.GameCraftVAEArchConfig
dataclass
¶GameCraftVAEArchConfig(stacked_params_mapping: list[tuple[str, str, str]] = list(), scaling_factor: float = 0.476986, temporal_compression_ratio: int = 4, spatial_compression_ratio: int = 8, in_channels: int = 3, out_channels: int = 3, latent_channels: int = 16, down_block_types: tuple[str, ...] = ('DownEncoderBlockCausal3D', 'DownEncoderBlockCausal3D', 'DownEncoderBlockCausal3D', 'DownEncoderBlockCausal3D'), up_block_types: tuple[str, ...] = ('UpDecoderBlockCausal3D', 'UpDecoderBlockCausal3D', 'UpDecoderBlockCausal3D', 'UpDecoderBlockCausal3D'), block_out_channels: tuple[int, ...] = (128, 256, 512, 512), layers_per_block: int = 2, act_fn: str = 'silu', norm_num_groups: int = 32, time_compression_ratio: int = 4, mid_block_add_attention: bool = True, mid_block_causal_attn: bool = True, sample_size: int = 256, sample_tsize: int = 64)
Bases: VAEArchConfig
Architecture config matching official AutoencoderKLCausal3D config.json.
fastvideo.configs.models.vaes.gamecraftvae.GameCraftVAEConfig
dataclass
¶GameCraftVAEConfig(arch_config: VAEArchConfig = GameCraftVAEArchConfig(), load_encoder: bool = True, load_decoder: bool = True, tile_sample_min_height: int = 256, tile_sample_min_width: int = 256, tile_sample_min_num_frames: int = 16, tile_sample_stride_height: int = 192, tile_sample_stride_width: int = 192, tile_sample_stride_num_frames: int = 12, blend_num_frames: int = 0, use_tiling: bool = True, use_temporal_tiling: bool = True, use_parallel_tiling: bool = True)
fastvideo.configs.models.vaes.gen3cvae
¶ fastvideo.configs.models.vaes.gen3cvae.Gen3CVAEConfig
dataclass
¶Gen3CVAEConfig(arch_config: CosmosVAEArchConfig = CosmosVAEArchConfig(), load_encoder: bool = True, load_decoder: bool = True, tile_sample_min_height: int = 256, tile_sample_min_width: int = 256, tile_sample_min_num_frames: int = 16, tile_sample_stride_height: int = 192, tile_sample_stride_width: int = 192, tile_sample_stride_num_frames: int = 12, blend_num_frames: int = 0, use_tiling: bool = False, use_temporal_tiling: bool = False, use_parallel_tiling: bool = False, use_feature_cache: bool = True)
Bases: CosmosVAEConfig
GEN3C VAE config placeholder.
GEN3C uses tokenizer-backed VAE loading logic at runtime, but we keep a model-specific config class so pipeline/model configs stay model-scoped.
fastvideo.configs.pipelines
¶
Classes¶
fastvideo.configs.pipelines.Cosmos25Config
dataclass
¶
Cosmos25Config(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 0.0, flow_shift: float = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = (lambda: Cosmos25VideoConfig(arch_config=Cosmos25ArchConfig(num_attention_heads=16, attention_head_dim=128, in_channels=16, out_channels=16, num_layers=28, patch_size=[1, 2, 2], max_size=[128, 240, 240], rope_scale=[1.0, 3.0, 3.0], text_embed_dim=1024, mlp_ratio=4.0, adaln_lora_dim=256, use_adaln_lora=True, concat_padding_mask=True, extra_pos_embed_type=None, use_crossattn_projection=True, rope_enable_fps_modulation=False, qk_norm='rms_norm')))(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Cosmos25VAEConfig(), vae_precision: str = 'bf16', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Reason1Config(arch_config=Reason1ArchConfig(embedding_concat_strategy='full_concat')),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (_identity_preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (reason1_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None)
fastvideo.configs.pipelines.CosmosConfig
dataclass
¶
CosmosConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: float = 1.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = CosmosVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = CosmosVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5LargeConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_large_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, conditioning_strategy: str = 'frame_replace', min_num_conditional_frames: int = 1, max_num_conditional_frames: int = 2, sigma_conditional: float = 0.0001, sigma_data: float = 1.0, state_ch: int = 16, state_t: int = 24, text_encoder_class: str = 'T5')
fastvideo.configs.pipelines.FastHunyuanConfig
dataclass
¶
FastHunyuanConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: int = 17, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = HunyuanVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (LlamaConfig(), CLIPTextConfig()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp16'))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (llama_preprocess_text, clip_preprocess_text))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], tensor], ...] = (lambda: (llama_postprocess_text, clip_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None)
fastvideo.configs.pipelines.HYWorldConfig
dataclass
¶
HYWorldConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 5, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HYWorldConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = SiglipVisionConfig(), image_encoder_precision: str = 'fp16', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
fastvideo.configs.pipelines.Hunyuan15T2V480PConfig
dataclass
¶
Hunyuan15T2V480PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 5, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideo15Config(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
fastvideo.configs.pipelines.Hunyuan15T2V720PConfig
dataclass
¶
Hunyuan15T2V720PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 9, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideo15Config(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
fastvideo.configs.pipelines.HunyuanConfig
dataclass
¶
HunyuanConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: int = 7, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = HunyuanVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (LlamaConfig(), CLIPTextConfig()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp16'))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (llama_preprocess_text, clip_preprocess_text))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], tensor], ...] = (lambda: (llama_postprocess_text, clip_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None)
fastvideo.configs.pipelines.HunyuanGameCraftPipelineConfig
dataclass
¶
HunyuanGameCraftPipelineConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 5, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanGameCraftConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = GameCraftVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (LlamaConfig(), CLIPTextConfig()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp16'))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (llama_preprocess_text, clip_preprocess_text))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (llama_postprocess_text, clip_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None)
Bases: PipelineConfig
Configuration for HunyuanGameCraft pipeline.
Inherits text encoding from HunyuanVideo but uses: the GameCraft DiT with CameraNet; the same VAE (HunyuanVAE); and the same text encoders (LLaMA + CLIP).
fastvideo.configs.pipelines.LTX2T2VConfig
dataclass
¶
LTX2T2VConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = None, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = LTX2VideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = LTX2VAEConfig(), vae_precision: str = 'bf16', vae_tiling: bool = True, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (LTX2GemmaConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (ltx2_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, audio_decoder_config: ModelConfig = LTX2AudioDecoderConfig(), vocoder_config: ModelConfig = LTX2VocoderConfig(), audio_decoder_precision: str = 'bf16', vocoder_precision: str = 'bf16')
fastvideo.configs.pipelines.PipelineConfig
dataclass
¶
PipelineConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = None, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = DiTConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = VAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (EncoderConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], tensor], ...] = (lambda: (postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None)
Base configuration for all pipeline architectures.
Functions¶
fastvideo.configs.pipelines.PipelineConfig.from_kwargs
classmethod
¶from_kwargs(kwargs: dict[str, Any], config_cli_prefix: str = '') -> PipelineConfig
Load a PipelineConfig from a kwargs dictionary. Arguments: kwargs — dictionary of keyword arguments; config_cli_prefix — prefix of the CLI arguments for this PipelineConfig instance.
Source code in fastvideo/configs/pipelines/base.py
fastvideo.configs.pipelines.PipelineConfig.from_pretrained
classmethod
¶from_pretrained(model_path: str) -> PipelineConfig
Use the pipeline class setting stored at model_path to select the matching pipeline config.
Source code in fastvideo/configs/pipelines/base.py
fastvideo.configs.pipelines.WanI2V480PConfig
dataclass
¶
WanI2V480PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 3.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = CLIPVisionConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.WanI2V720PConfig
dataclass
¶
WanI2V720PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = CLIPVisionConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.WanT2V480PConfig
dataclass
¶
WanT2V480PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 3.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.WanT2V720PConfig
dataclass
¶
WanT2V720PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
Modules¶
fastvideo.configs.pipelines.base
¶
Classes¶
fastvideo.configs.pipelines.base.PipelineConfig
dataclass
¶PipelineConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = None, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = DiTConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = VAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (EncoderConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], tensor], ...] = (lambda: (postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None)
Base configuration for all pipeline architectures.
fastvideo.configs.pipelines.base.PipelineConfig.from_kwargs
classmethod
¶from_kwargs(kwargs: dict[str, Any], config_cli_prefix: str = '') -> PipelineConfig
Load a PipelineConfig from a kwargs dictionary. Arguments: kwargs — dictionary of keyword arguments; config_cli_prefix — prefix of the CLI arguments for this PipelineConfig instance.
Source code in fastvideo/configs/pipelines/base.py
fastvideo.configs.pipelines.base.PipelineConfig.from_pretrained
classmethod
¶from_pretrained(model_path: str) -> PipelineConfig
Use the pipeline class setting from model_path to match the pipeline config.
Source code in fastvideo/configs/pipelines/base.py
Functions¶
fastvideo.configs.pipelines.base.parse_int_list
¶Parse a comma-separated string of integers into a list.
fastvideo.configs.pipelines.cosmos
¶
Classes¶
fastvideo.configs.pipelines.cosmos.CosmosConfig
dataclass
¶CosmosConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: float = 1.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = CosmosVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = CosmosVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5LargeConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_large_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, conditioning_strategy: str = 'frame_replace', min_num_conditional_frames: int = 1, max_num_conditional_frames: int = 2, sigma_conditional: float = 0.0001, sigma_data: float = 1.0, state_ch: int = 16, state_t: int = 24, text_encoder_class: str = 'T5')
Functions¶
fastvideo.configs.pipelines.cosmos.t5_large_postprocess_text
¶Postprocess T5 Large text encoder outputs for Cosmos pipeline.
Return raw last_hidden_state without truncation/padding.
Source code in fastvideo/configs/pipelines/cosmos.py
fastvideo.configs.pipelines.cosmos2_5
¶
Classes¶
fastvideo.configs.pipelines.cosmos2_5.Cosmos25Config
dataclass
¶Cosmos25Config(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 0.0, flow_shift: float = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = (lambda: Cosmos25VideoConfig(arch_config=Cosmos25ArchConfig(num_attention_heads=16, attention_head_dim=128, in_channels=16, out_channels=16, num_layers=28, patch_size=[1, 2, 2], max_size=[128, 240, 240], rope_scale=[1.0, 3.0, 3.0], text_embed_dim=1024, mlp_ratio=4.0, adaln_lora_dim=256, use_adaln_lora=True, concat_padding_mask=True, extra_pos_embed_type=None, use_crossattn_projection=True, rope_enable_fps_modulation=False, qk_norm='rms_norm')))(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Cosmos25VAEConfig(), vae_precision: str = 'bf16', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Reason1Config(arch_config=Reason1ArchConfig(embedding_concat_strategy='full_concat')),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (_identity_preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (reason1_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None)
fastvideo.configs.pipelines.gen3c
¶
Classes¶
fastvideo.configs.pipelines.gen3c.Gen3CConfig
dataclass
¶Gen3CConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: float = 1.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = Gen3CVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Gen3CVAEConfig(), vae_precision: str = 'bf16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (_Gen3CT5LargeConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_large_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, conditioning_strategy: str = 'frame_replace', min_num_conditional_frames: int = 1, max_num_conditional_frames: int = 2, sigma_conditional: float = 0.001, sigma_data: float = 0.5, state_ch: int = 16, state_t: int = 16, text_encoder_class: str = 'T5', frame_buffer_max: int = 2, noise_aug_strength: float = 0.0, filter_points_threshold: float = 0.05, use_moge_depth: bool = True, moge_model_name: str = 'Ruicheng/moge-vitl', offload_moge_after_depth: bool = True, default_trajectory_type: str = 'left', default_movement_distance: float = 0.3, default_camera_rotation: str = 'center_facing', video_resolution: tuple[int, int] = (704, 1280), num_frames: int = 121, fps: int = 24, cfg_behavior: str = 'legacy', default_negative_prompt: str = 'The video captures a series of frames showing ugly scenes, static with no motion, motion blur, over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, 
underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. Overall, the video is of poor quality.', autoregressive_chunk_frames: int = 121, autoregressive_overlap_frames: int = 1)
Bases: PipelineConfig
Configuration for GEN3C Video Generation Pipeline.
GEN3C extends Cosmos with a 3D cache for camera-controlled video generation. Key parameters:
- frame_buffer_max: Number of 3D cache buffers (default: 2)
- noise_aug_strength: Strength of noise augmentation per buffer
- filter_points_threshold: Threshold for filtering unreliable depth points
fastvideo.configs.pipelines.gen3c.Gen3CInferenceConfig
dataclass
¶Gen3CInferenceConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: float = 1.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = Gen3CVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Gen3CVAEConfig(), vae_precision: str = 'bf16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (_Gen3CT5LargeConfig(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_large_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, conditioning_strategy: str = 'frame_replace', min_num_conditional_frames: int = 1, max_num_conditional_frames: int = 2, sigma_conditional: float = 0.001, sigma_data: float = 0.5, state_ch: int = 16, state_t: int = 16, text_encoder_class: str = 'T5', frame_buffer_max: int = 2, noise_aug_strength: float = 0.0, filter_points_threshold: float = 0.05, use_moge_depth: bool = True, moge_model_name: str = 'Ruicheng/moge-vitl', offload_moge_after_depth: bool = True, default_trajectory_type: str = 'left', default_movement_distance: float = 0.3, default_camera_rotation: str = 'center_facing', video_resolution: tuple[int, int] = (704, 1280), num_frames: int = 121, fps: int = 24, cfg_behavior: str = 'legacy', default_negative_prompt: str = 'The video captures a series of frames showing ugly scenes, static with no motion, motion blur, over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly 
lit areas, underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. Overall, the video is of poor quality.', autoregressive_chunk_frames: int = 121, autoregressive_overlap_frames: int = 1, batch_size: int = 1, gradient_checkpointing: bool = False, guidance_scale: float = 1.0, num_inference_steps: int = 35)
Functions¶
fastvideo.configs.pipelines.gen3c.t5_large_postprocess_text
¶Postprocess T5 Large text encoder outputs for GEN3C pipeline.
Return raw last_hidden_state without truncation/padding.
Source code in fastvideo/configs/pipelines/gen3c.py
fastvideo.configs.pipelines.hunyuan
¶
Classes¶
fastvideo.configs.pipelines.hunyuan.FastHunyuanConfig
dataclass
¶FastHunyuanConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: int = 17, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = HunyuanVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (LlamaConfig(), CLIPTextConfig()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp16'))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (llama_preprocess_text, clip_preprocess_text))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (llama_postprocess_text, clip_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None)
fastvideo.configs.pipelines.hunyuan.HunyuanConfig
dataclass
¶HunyuanConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: int = 6, flow_shift: int = 7, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = HunyuanVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (LlamaConfig(), CLIPTextConfig()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp16'))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (llama_preprocess_text, clip_preprocess_text))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (llama_postprocess_text, clip_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None)
fastvideo.configs.pipelines.hunyuan15
¶
Classes¶
fastvideo.configs.pipelines.hunyuan15.Hunyuan15I2V720PConfig
dataclass
¶Hunyuan15I2V720PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 7, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideo15Config(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
fastvideo.configs.pipelines.hunyuan15.Hunyuan15SR1080PConfig
dataclass
¶Hunyuan15SR1080PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 7, flow_shift_sr: int = 2, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideo15Config(), dit_precision: str = 'bf16', upsampler_config: tuple[UpsamplerConfig, ...] = (lambda: (SRTo720pUpsamplerConfig(), SRTo1080pUpsamplerConfig()))(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
fastvideo.configs.pipelines.hunyuan15.Hunyuan15T2V480PConfig
dataclass
¶Hunyuan15T2V480PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 5, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideo15Config(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
fastvideo.configs.pipelines.hunyuan15.Hunyuan15T2V720PConfig
dataclass
¶Hunyuan15T2V720PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 9, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanVideo15Config(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
Functions¶
fastvideo.configs.pipelines.hunyuan15.extract_glyph_texts
¶Extract glyph texts from prompt using regex pattern.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
prompt
|
str
|
Input prompt string |
required |
Returns:
| Type | Description |
|---|---|
list[str] | None
|
List of extracted glyph texts |
Source code in fastvideo/configs/pipelines/hunyuan15.py
fastvideo.configs.pipelines.hunyuan15.format_text_input
¶Apply text to template.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
prompt
|
List[str]
|
Input text. |
required |
system_message
|
str
|
System message. |
required |
Returns:
| Type | Description |
|---|---|
list[dict[str, Any]]
|
List[Dict[str, Any]]: List of chat conversation. |
Source code in fastvideo/configs/pipelines/hunyuan15.py
fastvideo.configs.pipelines.hunyuangamecraft
¶
Pipeline configuration for HunyuanGameCraft.
HunyuanGameCraft extends HunyuanVideo with: 1. CameraNet for camera/action conditioning (Plücker coordinates) 2. Mask-based conditioning for autoregressive generation 3. 33 input channels (16 latent + 16 gt_latent + 1 mask)
Text encoders are the same as HunyuanVideo: - LLaVA-LLaMA-3-8B for primary text encoding (4096 dim) - CLIP ViT-L/14 for secondary pooled embeddings (768 dim)
Classes¶
fastvideo.configs.pipelines.hunyuangamecraft.HunyuanGameCraftPipelineConfig
dataclass
¶HunyuanGameCraftPipelineConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 5, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HunyuanGameCraftConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = GameCraftVAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (LlamaConfig(), CLIPTextConfig()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp16'))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (llama_preprocess_text, clip_preprocess_text))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (llama_postprocess_text, clip_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None)
Bases: PipelineConfig
Configuration for HunyuanGameCraft pipeline.
Inherits text encoding from HunyuanVideo but uses: - GameCraft DiT with CameraNet - Same VAE (HunyuanVAE) - Same text encoders (LLaMA + CLIP)
fastvideo.configs.pipelines.hyworld
¶
Classes¶
fastvideo.configs.pipelines.hyworld.HYWorldConfig
dataclass
¶HYWorldConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: int = 5, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = HYWorldConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = Hunyuan15VAEConfig(), vae_precision: str = 'fp16', vae_tiling: bool = True, vae_sp: bool = True, image_encoder_config: EncoderConfig = SiglipVisionConfig(), image_encoder_precision: str = 'fp16', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (Qwen2_5_VLConfig(), T5Config()))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp16', 'fp32'))(), preprocess_text_funcs: tuple[Callable[[Any], Any], ...] = (lambda: (qwen_preprocess_text, byt5_preprocess_text))(), postprocess_text_funcs: tuple[Callable[..., Any], ...] = (lambda: (qwen_postprocess_text, byt5_postprocess_text))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, text_encoder_crop_start: int = PROMPT_TEMPLATE_TOKEN_LENGTH, text_encoder_max_lengths: tuple[int, ...] = (lambda: (1000 + PROMPT_TEMPLATE_TOKEN_LENGTH, 256))())
fastvideo.configs.pipelines.longcat
¶
Classes¶
fastvideo.configs.pipelines.longcat.LongCatDiTArchConfig
dataclass
¶LongCatDiTArchConfig(stacked_params_mapping: list[tuple[str, str, str]] = list(), _fsdp_shard_conditions: list = list(), _compile_conditions: list = list(), param_names_mapping: dict = dict(), reverse_param_names_mapping: dict = dict(), lora_param_names_mapping: dict = dict(), _supported_attention_backends: tuple[AttentionBackendEnum, ...] = (SAGE_ATTN, FLASH_ATTN, TORCH_SDPA, VIDEO_SPARSE_ATTN, VMOBA_ATTN, SAGE_ATTN_THREE, SLA_ATTN, SAGE_SLA_ATTN), hidden_size: int = 0, num_attention_heads: int = 0, num_channels_latents: int = 0, in_channels: int = 16, out_channels: int = 16, exclude_lora_layers: list[str] = list(), boundary_ratio: float | None = None, adaln_tembed_dim: int = 512, caption_channels: int = 4096, depth: int = 48, enable_bsa: bool = False, enable_flashattn3: bool = False, enable_flashattn2: bool = True, enable_xformers: bool = False, frequency_embedding_size: int = 256, mlp_ratio: int = 4, num_heads: int = 32, text_tokens_zero_pad: bool = True, patch_size: list[int] = (lambda: [1, 2, 2])(), cp_split_hw: list[int] | None = None, bsa_params: dict | None = None)
Bases: DiTArchConfig
Extended DiTArchConfig with LongCat-specific fields.
fastvideo.configs.pipelines.longcat.LongCatT2V480PConfig
dataclass
¶LongCatT2V480PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = None, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = (lambda: DiTConfig(arch_config=LongCatDiTArchConfig()))(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'bf16', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[T5Config, ...] = (lambda: (LongCatT5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (longcat_preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (umt5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, enable_kv_cache: bool = True, offload_kv_cache: bool = False, enable_bsa: bool = False, use_distill: bool = False, enhance_hf: bool = False, bsa_params: dict | None = None, bsa_sparsity: float | None = None, bsa_cdf_threshold: float | None = None, bsa_chunk_q: list[int] | None = None, bsa_chunk_k: list[int] | None = None, t_thresh: float | None = None)
Bases: PipelineConfig
Configuration for LongCat pipeline (480p).
Components expected by loaders
- tokenizer: AutoTokenizer
- text_encoder: UMT5EncoderModel
- transformer: LongCatTransformer3DModel
- vae: AutoencoderKLWan (Wan VAE, 4x8 compression)
- scheduler: FlowMatchEulerDiscreteScheduler
fastvideo.configs.pipelines.longcat.LongCatT2V704PConfig
dataclass
¶LongCatT2V704PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = None, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = (lambda: DiTConfig(arch_config=LongCatDiTArchConfig()))(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'bf16', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[T5Config, ...] = (lambda: (LongCatT5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('bf16',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (longcat_preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (umt5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, enable_kv_cache: bool = True, offload_kv_cache: bool = False, enable_bsa: bool = True, use_distill: bool = False, enhance_hf: bool = False, bsa_params: dict | None = None, bsa_sparsity: float | None = None, bsa_cdf_threshold: float | None = None, bsa_chunk_q: list[int] | None = None, bsa_chunk_k: list[int] | None = None, t_thresh: float | None = None)
Bases: LongCatT2V480PConfig
Configuration for LongCat pipeline (704p) with BSA enabled by default.
Uses the same resolution and BSA parameters as original LongCat refinement stage.
BSA parameters configured in transformer config.json with chunk_3d_shape=[4,4,4]:
- Input: 704×1280×96
- VAE (8x): 88×160×96
- Patch [1,2,2]: 44×80×96
- chunk [4,4,4]: 96%4=0, 44%4=0, 80%4=0 ✅
This configuration matches the original LongCat refinement stage parameters.
fastvideo.configs.pipelines.longcat.LongCatT5ArchConfig
dataclass
¶LongCatT5ArchConfig(stacked_params_mapping: list[tuple[str, str, str]] = (lambda: [('.qkv_proj', '.q', 'q'), ('.qkv_proj', '.k', 'k'), ('.qkv_proj', '.v', 'v')])(), architectures: list[str] = (lambda: [])(), _supported_attention_backends: tuple[AttentionBackendEnum, ...] = (FLASH_ATTN, TORCH_SDPA), output_hidden_states: bool = False, use_return_dict: bool = True, vocab_size: int = 32128, hidden_size: int = 0, num_hidden_layers: int = 0, num_attention_heads: int = 0, pad_token_id: int = 0, eos_token_id: int = 1, text_len: int = 512, hidden_state_skip_layer: int = 0, decoder_start_token_id: int = 0, output_past: bool = True, scalable_attention: bool = True, tie_word_embeddings: bool = False, tokenizer_kwargs: dict[str, Any] = dict(), _fsdp_shard_conditions: list = (lambda: [_is_transformer_layer, _is_embeddings, _is_final_layernorm])(), d_model: int = 512, d_kv: int = 64, d_ff: int = 2048, num_layers: int = 6, num_decoder_layers: int | None = None, num_heads: int = 8, relative_attention_num_buckets: int = 32, relative_attention_max_distance: int = 128, dropout_rate: float = 0.1, layer_norm_epsilon: float = 1e-06, initializer_factor: float = 1.0, feed_forward_proj: str = 'relu', dense_act_fn: str = '', is_gated_act: bool = False, is_encoder_decoder: bool = True, use_cache: bool = True, classifier_dropout: float = 0.0, dtype: str | None = None, gradient_checkpointing: bool = False)
Bases: T5ArchConfig
T5 arch that pads tokenizer output to max_length.
LongCat's denoising stage concatenates positive and negative
attention masks along the batch dimension for CFG, which requires
uniform seq length. The shared :class:T5ArchConfig dropped the
"padding": "max_length" tokenizer kwarg so other DiTs could run
with variable-length masks; LongCat still needs the uniform
contract.
Functions¶
fastvideo.configs.pipelines.longcat.longcat_preprocess_text
¶Clean and preprocess text like original LongCat implementation.
This function applies the same text cleaning pipeline as the original LongCat-Video implementation to ensure identical tokenization results.
Steps: 1. basic_clean: Fix unicode issues and unescape HTML entities 2. whitespace_clean: Normalize whitespace to single spaces
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
prompt
|
str
|
Raw input text prompt |
required |
Returns:
| Type | Description |
|---|---|
str
|
Cleaned and normalized text prompt |
Source code in fastvideo/configs/pipelines/longcat.py
fastvideo.configs.pipelines.longcat.umt5_postprocess_text
¶Postprocess UMT5/T5 encoder outputs to fixed length 512 embeddings.
Source code in fastvideo/configs/pipelines/longcat.py
fastvideo.configs.pipelines.turbodiffusion
¶
TurboDiffusion pipeline configurations.
TurboDiffusion uses RCM (recurrent Consistency Model) scheduler with SLA (Sparse-Linear Attention) for fast 1-4 step video generation.
Classes¶
fastvideo.configs.pipelines.turbodiffusion.TurboDiffusionI2VConfig
dataclass
¶TurboDiffusionI2VConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = CLIPVisionConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = 0.9, precision: str = 'bf16', warp_denoising_step: bool = True)
Bases: PipelineConfig
Base configuration for TurboDiffusion I2V pipeline.
Uses RCM scheduler with sigma_max=200 for 1-4 step generation. Uses boundary_ratio=0.9 for high-noise to low-noise model switching.
fastvideo.configs.pipelines.turbodiffusion.TurboDiffusionI2V_A14B_Config
dataclass
¶TurboDiffusionI2V_A14B_Config(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = CLIPVisionConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = 0.9, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.turbodiffusion.TurboDiffusionT2VConfig
dataclass
¶TurboDiffusionT2VConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 3.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
Bases: PipelineConfig
Base configuration for TurboDiffusion T2V pipeline.
Uses RCM scheduler with sigma_max=80 for 1-4 step generation. No boundary_ratio (single model, no switching).
fastvideo.configs.pipelines.turbodiffusion.TurboDiffusionT2V_14B_Config
dataclass
¶TurboDiffusionT2V_14B_Config(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
Bases: TurboDiffusionT2VConfig
Configuration for TurboDiffusion T2V 14B model.
Uses same config as 1.3B but with higher flow_shift for 14B model.
fastvideo.configs.pipelines.turbodiffusion.TurboDiffusionT2V_1_3B_Config
dataclass
¶TurboDiffusionT2V_1_3B_Config(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 3.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.wan
¶
Classes¶
fastvideo.configs.pipelines.wan.FastWan2_1_T2V_480P_Config
dataclass
¶FastWan2_1_T2V_480P_Config(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 8.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = (lambda: [1000, 757, 522])(), ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.wan.WANV2VConfig
dataclass
¶WANV2VConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 3.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = WAN2_1ControlCLIPVisionConfig(), image_encoder_precision: str = 'bf16', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.wan.WanI2V480PConfig
dataclass
¶WanI2V480PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 3.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = CLIPVisionConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.wan.WanI2V720PConfig
dataclass
¶WanI2V720PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = CLIPVisionConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.wan.WanT2V480PConfig
dataclass
¶WanT2V480PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 3.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.pipelines.wan.WanT2V720PConfig
dataclass
¶WanT2V720PConfig(model_path: str = '', pipeline_config_path: str | None = None, embedded_cfg_scale: float = 6.0, flow_shift: float | None = 5.0, flow_shift_sr: float | None = None, disable_autocast: bool = False, is_causal: bool = False, dit_config: DiTConfig = WanVideoConfig(), dit_precision: str = 'bf16', upsampler_config: UpsamplerConfig = UpsamplerConfig(), upsampler_precision: str = 'fp32', vae_config: VAEConfig = WanVAEConfig(), vae_precision: str = 'fp32', vae_tiling: bool = False, vae_sp: bool = False, image_encoder_config: EncoderConfig = EncoderConfig(), image_encoder_precision: str = 'fp32', text_encoder_configs: tuple[EncoderConfig, ...] = (lambda: (T5Config(),))(), text_encoder_precisions: tuple[str, ...] = (lambda: ('fp32',))(), preprocess_text_funcs: tuple[Callable[[str], str], ...] = (lambda: (preprocess_text,))(), postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], Tensor], ...] = (lambda: (t5_postprocess_text,))(), dmd_denoising_steps: list[int] | None = None, ti2v_task: bool = False, boundary_ratio: float | None = None, precision: str = 'bf16', warp_denoising_step: bool = True)
fastvideo.configs.utils
¶
Functions¶
fastvideo.configs.utils.clean_cli_args
¶
Clean the arguments by removing the ones that are not explicitly provided by the user.
Source code in fastvideo/configs/utils.py
fastvideo.configs.utils.update_config_from_args
¶
update_config_from_args(config: Any, args_dict: dict[str, Any], prefix: str = '', pop_args: bool = False) -> bool
Update configuration object from arguments dictionary.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| config | Any | The configuration object to update | required |
| args_dict | dict[str, Any] | Dictionary containing arguments | required |
| prefix | str | Prefix for the configuration parameters in the args_dict. If None, assumes direct attribute mapping without prefix. | '' |