
Module: fastvideo.configs.models.dits.gen3c

Classes

fastvideo.configs.models.dits.gen3c.Gen3CArchConfig (dataclass)

Gen3CArchConfig(stacked_params_mapping: list[tuple[str, str, str]] = list(), _fsdp_shard_conditions: list = (lambda: [is_transformer_blocks])(), _compile_conditions: list = list(), param_names_mapping: dict = (lambda: {'^net\\.x_embedder\\.proj\\.1\\.(.*)$': 'patch_embed.proj.\\1', '^net\\.t_embedder\\.0\\.(.*)$': 'time_embed.time_proj.\\1', '^net\\.t_embedder\\.1\\.linear_1\\.(.*)$': 'time_embed.t_embedder.linear_1.\\1', '^net\\.t_embedder\\.1\\.linear_2\\.(.*)$': 'time_embed.t_embedder.linear_2.\\1', '^net\\.augment_sigma_embedder\\.0\\.(.*)$': 'augment_sigma_embed.time_proj.\\1', '^net\\.augment_sigma_embedder\\.1\\.linear_1\\.(.*)$': 'augment_sigma_embed.t_embedder.linear_1.\\1', '^net\\.augment_sigma_embedder\\.1\\.linear_2\\.(.*)$': 'augment_sigma_embed.t_embedder.linear_2.\\1', '^net\\.affline_norm\\.(.*)$': 'affine_norm.\\1', '^net\\.extra_pos_embedder\\.pos_emb_t$': 'learnable_pos_embed.pos_emb_t', '^net\\.extra_pos_embedder\\.pos_emb_h$': 'learnable_pos_embed.pos_emb_h', '^net\\.extra_pos_embedder\\.pos_emb_w$': 'learnable_pos_embed.pos_emb_w', '^net\\.blocks\\.block(\\d+)\\.blocks\\.0\\.block\\.attn\\.to_q\\.0\\.(.*)$': 'transformer_blocks.\\1.attn1.to_q.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.0\\.block\\.attn\\.to_q\\.1\\.(.*)$': 'transformer_blocks.\\1.attn1.norm_q.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.0\\.block\\.attn\\.to_k\\.0\\.(.*)$': 'transformer_blocks.\\1.attn1.to_k.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.0\\.block\\.attn\\.to_k\\.1\\.(.*)$': 'transformer_blocks.\\1.attn1.norm_k.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.0\\.block\\.attn\\.to_v\\.0\\.(.*)$': 'transformer_blocks.\\1.attn1.to_v.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.0\\.block\\.attn\\.to_out\\.0\\.(.*)$': 'transformer_blocks.\\1.attn1.to_out.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.0\\.adaLN_modulation\\.(.*)$': 'transformer_blocks.\\1.adaln_modulation_self_attn.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.1\\.block\\.attn\\.to_q\\.0\\.(.*)$': 
'transformer_blocks.\\1.attn2.to_q.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.1\\.block\\.attn\\.to_q\\.1\\.(.*)$': 'transformer_blocks.\\1.attn2.norm_q.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.1\\.block\\.attn\\.to_k\\.0\\.(.*)$': 'transformer_blocks.\\1.attn2.to_k.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.1\\.block\\.attn\\.to_k\\.1\\.(.*)$': 'transformer_blocks.\\1.attn2.norm_k.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.1\\.block\\.attn\\.to_v\\.0\\.(.*)$': 'transformer_blocks.\\1.attn2.to_v.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.1\\.block\\.attn\\.to_out\\.0\\.(.*)$': 'transformer_blocks.\\1.attn2.to_out.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.1\\.adaLN_modulation\\.(.*)$': 'transformer_blocks.\\1.adaln_modulation_cross_attn.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.2\\.block\\.layer1\\.(.*)$': 'transformer_blocks.\\1.mlp.fc_in.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.2\\.block\\.layer2\\.(.*)$': 'transformer_blocks.\\1.mlp.fc_out.\\2', '^net\\.blocks\\.block(\\d+)\\.blocks\\.2\\.adaLN_modulation\\.(.*)$': 'transformer_blocks.\\1.adaln_modulation_mlp.\\2', '^net\\.final_layer\\.linear\\.(.*)$': 'final_layer.proj_out.\\1', '^net\\.final_layer\\.adaLN_modulation\\.(.*)$': 'final_layer.adaln_modulation.\\1'})(), reverse_param_names_mapping: dict = dict(), lora_param_names_mapping: dict = (lambda: {'^transformer_blocks\\.(\\d+)\\.attn1\\.to_q\\.(.*)$': 'transformer_blocks.\\1.attn1.to_q.\\2', '^transformer_blocks\\.(\\d+)\\.attn1\\.to_k\\.(.*)$': 'transformer_blocks.\\1.attn1.to_k.\\2', '^transformer_blocks\\.(\\d+)\\.attn1\\.to_v\\.(.*)$': 'transformer_blocks.\\1.attn1.to_v.\\2', '^transformer_blocks\\.(\\d+)\\.attn1\\.to_out\\.(.*)$': 'transformer_blocks.\\1.attn1.to_out.\\2', '^transformer_blocks\\.(\\d+)\\.attn2\\.to_q\\.(.*)$': 'transformer_blocks.\\1.attn2.to_q.\\2', '^transformer_blocks\\.(\\d+)\\.attn2\\.to_k\\.(.*)$': 'transformer_blocks.\\1.attn2.to_k.\\2', '^transformer_blocks\\.(\\d+)\\.attn2\\.to_v\\.(.*)$': 
'transformer_blocks.\\1.attn2.to_v.\\2', '^transformer_blocks\\.(\\d+)\\.attn2\\.to_out\\.(.*)$': 'transformer_blocks.\\1.attn2.to_out.\\2', '^transformer_blocks\\.(\\d+)\\.mlp\\.(.*)$': 'transformer_blocks.\\1.mlp.\\2'})(), _supported_attention_backends: tuple[AttentionBackendEnum, ...] = (SAGE_ATTN, FLASH_ATTN, TORCH_SDPA, VIDEO_SPARSE_ATTN, VMOBA_ATTN, SAGE_ATTN_THREE, SLA_ATTN, SAGE_SLA_ATTN), hidden_size: int = 0, num_attention_heads: int = 32, num_channels_latents: int = 0, in_channels: int = 16, out_channels: int = 16, exclude_lora_layers: list[str] = (lambda: ['embedder'])(), boundary_ratio: float | None = None, CHANNELS_PER_BUFFER: int = 32, frame_buffer_max: int = 2, attention_head_dim: int = 128, num_layers: int = 28, mlp_ratio: float = 4.0, text_embed_dim: int = 1024, adaln_lora_dim: int = 256, use_adaln_lora: bool = True, add_augment_sigma_embedding: bool = False, max_size: tuple[int, int, int] = (128, 240, 240), patch_size: tuple[int, int, int] = (1, 2, 2), rope_scale: tuple[float, float, float] = (2.0, 1.0, 1.0), extra_pos_embed_type: str = 'learnable', concat_padding_mask: bool = True, use_crossattn_projection: bool = False, rope_enable_fps_modulation: bool = True, qk_norm: str = 'rms_norm', eps: float = 1e-06, affine_emb_norm: bool = True, block_x_format: str = 'THWBD')

Bases: DiTArchConfig

Configuration for GEN3C architecture (VideoExtendGeneralDIT).

fastvideo.configs.models.dits.gen3c.Gen3CVideoConfig (dataclass)

Gen3CVideoConfig(arch_config: DiTArchConfig = Gen3CArchConfig(), prefix: str = 'Gen3C', quant_config: QuantizationConfig | None = None)

Bases: DiTConfig

Configuration for GEN3C video generation model.