AnimateDiff-XL
484 lines
UNetMotionModel(
UNet3DConditionModel(
(conv_in): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv_in): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_proj): Timesteps()
(time_proj): Timesteps()
(time_embedding): TimestepEmbedding(
(time_embedding): TimestepEmbedding(
(linear_1): LoRACompatibleLinear(in_features=320, out_features=1280, bias=True)
(linear_1): LoRACompatibleLinear(in_features=320, out_features=1280, bias=True)
(act): SiLU()
(act): SiLU()
(linear_2): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(linear_2): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
)
)
(add_time_proj): Timesteps()
(add_time_proj): Timesteps()
(add_embedding): TimestepEmbedding(
(add_embedding): TimestepEmbedding(
(linear_1): LoRACompatibleLinear(in_features=2816, out_features=1280, bias=True)
(linear_1): LoRACompatibleLinear(in_features=2816, out_features=1280, bias=True)
(act): SiLU()
(act): SiLU()
(linear_2): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(linear_2): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
)
)
(down_blocks): ModuleList(
(down_blocks): ModuleList(
(0): DownBlockMotion(
(0): DownBlock3D(
(resnets): ModuleList(
(resnets): ModuleList(
(0-1): 2 x ResnetBlock2D(
(0-1): 2 x ResnetBlock2D(
(norm1): GroupNorm(32, 320, eps=1e-05, affine=True)
(norm1): GroupNorm(32, 320, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv1): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True)
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True)
(norm2): GroupNorm(32, 320, eps=1e-05, affine=True)
(norm2): GroupNorm(32, 320, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(nonlinearity): SiLU()
)
)
)
)
(motion_modules): ModuleList(
(motion_modules): ModuleList(
(0-1): 2 x TransformerTemporalModel(
(0-1): 2 x VanillaTemporalModule(
(norm): GroupNorm(32, 320, eps=1e-06, affine=True)
(temporal_transformer): TemporalTransformer3DModel(
(proj_in): Linear(in_features=320, out_features=320, bias=True)
(norm): GroupNorm(32, 320, eps=1e-06, affine=True)
(transformer_blocks): ModuleList(
(proj_in): Linear(in_features=320, out_features=320, bias=True)
(0): BasicTransformerBlock(
(transformer_blocks): ModuleList(
(pos_embed): SinusoidalPositionalEmbedding()
(0): TemporalTransformerBlock(
(norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(attention_blocks): ModuleList(
(attn1): Attention(
(0-1): 2 x TemporalSelfAttention(
(to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_out): ModuleList(
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True)
(0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
(1): Dropout(p=0.0, inplace=False)
)
(pos_encoder): PositionalEncoding(
(dropout): Dropout(p=0.0, inplace=False)
)
)
)
)
)
(norms): ModuleList(
(norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(0-1): 2 x LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(attn2): Attention(
(to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
)
(ff): FeedForward(
(norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(net): ModuleList(
(ff): FeedForward(
(0): GEGLU(
(net): ModuleList(
(proj): LoRACompatibleLinear(in_features=320, out_features=2560, bias=True)
(0): GEGLU(
)
(proj): LoRACompatibleLinear(in_features=320, out_features=2560, bias=True)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True)
)
)
(ff_norm): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
)
)
)
)
(proj_out): Linear(in_features=320, out_features=320, bias=True)
)
)
(proj_out): Linear(in_features=320, out_features=320, bias=True)
)
)
)
)
(downsamplers): ModuleList(
(downsamplers): ModuleList(
(0): Downsample2D(
(0): Downsample2D(
(conv): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(conv): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
)
)
)
)
)
)
(1): CrossAttnDownBlockMotion(
(1): CrossAttnDownBlock3D(
(attentions): ModuleList(
(attentions): ModuleList(
(0-1): 2 x Transformer2DModel(
(0-1): 2 x Transformer2DModel(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(proj_in): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(transformer_blocks): ModuleList(
(transformer_blocks): ModuleList(
(0-1): 2 x BasicTransformerBlock(
(0-1): 2 x BasicTransformerBlock(
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(attn1): Attention(
(attn1): Attention(
(to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_out): ModuleList(
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
(1): Dropout(p=0.0, inplace=False)
)
)
)
)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(attn2): Attention(
(attn2): Attention(
(to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_k): LoRACompatibleLinear(in_features=2048, out_features=640, bias=False)
(to_k): LoRACompatibleLinear(in_features=2048, out_features=640, bias=False)
(to_v): LoRACompatibleLinear(in_features=2048, out_features=640, bias=False)
(to_v): LoRACompatibleLinear(in_features=2048, out_features=640, bias=False)
(to_out): ModuleList(
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
(1): Dropout(p=0.0, inplace=False)
)
)
)
)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(ff): FeedForward(
(ff): FeedForward(
(net): ModuleList(
(net): ModuleList(
(0): GEGLU(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True)
(proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True)
)
)
(1): Dropout(p=0.0, inplace=False)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True)
(2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True)
)
)
)
)
)
)
)
)
(proj_out): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(proj_out): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
)
)
)
)
(resnets): ModuleList(
(resnets): ModuleList(
(0): ResnetBlock2D(
(0): ResnetBlock2D(
(norm1): GroupNorm(32, 320, eps=1e-05, affine=True)
(norm1): GroupNorm(32, 320, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv1): LoRACompatibleConv(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True)
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True)
(norm2): GroupNorm(32, 640, eps=1e-05, affine=True)
(norm2): GroupNorm(32, 640, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(nonlinearity): SiLU()
(conv_shortcut): LoRACompatibleConv(320, 640, kernel_size=(1, 1), stride=(1, 1))
(conv_shortcut): LoRACompatibleConv(320, 640, kernel_size=(1, 1), stride=(1, 1))
)
)
(1): ResnetBlock2D(
(1): ResnetBlock2D(
(norm1): GroupNorm(32, 640, eps=1e-05, affine=True)
(norm1): GroupNorm(32, 640, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv1): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True)
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True)
(norm2): GroupNorm(32, 640, eps=1e-05, affine=True)
(norm2): GroupNorm(32, 640, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(nonlinearity): SiLU()
)
)
)
)
(motion_modules): ModuleList(
(motion_modules): ModuleList(
(0-1): 2 x TransformerTemporalModel(
(0-1): 2 x VanillaTemporalModule(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(temporal_transformer): TemporalTransformer3DModel(
(proj_in): Linear(in_features=640, out_features=640, bias=True)
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(transformer_blocks): ModuleList(
(proj_in): Linear(in_features=640, out_features=640, bias=True)
(0): BasicTransformerBlock(
(transformer_blocks): ModuleList(
(pos_embed): SinusoidalPositionalEmbedding()
(0): TemporalTransformerBlock(
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(attention_blocks): ModuleList(
(attn1): Attention(
(0-1): 2 x TemporalSelfAttention(
(to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_out): ModuleList(
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
(1): Dropout(p=0.0, inplace=False)
)
(pos_encoder): PositionalEncoding(
(dropout): Dropout(p=0.0, inplace=False)
)
)
)
)
)
(norms): ModuleList(
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(0-1): 2 x LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(attn2): Attention(
(to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
)
(ff): FeedForward(
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(net): ModuleList(
(ff): FeedForward(
(0): GEGLU(
(net): ModuleList(
(proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True)
(0): GEGLU(
)
(proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True)
)
)
(ff_norm): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
)
)
)
)
(proj_out): Linear(in_features=640, out_features=640, bias=True)
)
)
(proj_out): Linear(in_features=640, out_features=640, bias=True)
)
)
)
)
(downsamplers): ModuleList(
(downsamplers): ModuleList(
(0): Downsample2D(
(0): Downsample2D(
(conv): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(conv): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
)
)
)
)
)
)
(2): CrossAttnDownBlockMotion(
(2): CrossAttnDownBlock3D(
(attentions): ModuleList(
(attentions): ModuleList(
(0-1): 2 x Transformer2DModel(
(0-1): 2 x Transformer2DModel(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(proj_in): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(transformer_blocks): ModuleList(
(transformer_blocks): ModuleList(
(0-9): 10 x BasicTransformerBlock(
(0-9): 10 x BasicTransformerBlock(
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(attn1): Attention(
(attn1): Attention(
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_out): ModuleList(
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
(1): Dropout(p=0.0, inplace=False)
)
)
)
)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(attn2): Attention(
(attn2): Attention(
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=2048, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=2048, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=2048, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=2048, out_features=1280, bias=False)
(to_out): ModuleList(
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
(1): Dropout(p=0.0, inplace=False)
)
)
)
)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(ff): FeedForward(
(ff): FeedForward(
(net): ModuleList(
(net): ModuleList(
(0): GEGLU(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True)
(proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True)
)
)
(1): Dropout(p=0.0, inplace=False)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True)
(2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True)
)
)
)
)
)
)
)
)
(proj_out): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(proj_out): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
)
)
)
)
(resnets): ModuleList(
(resnets): ModuleList(
(0): ResnetBlock2D(
(0): ResnetBlock2D(
(norm1): GroupNorm(32, 640, eps=1e-05, affine=True)
(norm1): GroupNorm(32, 640, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv1): LoRACompatibleConv(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)
(norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(nonlinearity): SiLU()
(conv_shortcut): LoRACompatibleConv(640, 1280, kernel_size=(1, 1), stride=(1, 1))
(conv_shortcut): LoRACompatibleConv(640, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
(1): ResnetBlock2D(
(1): ResnetBlock2D(
(norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)
(norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv1): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)
(norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(nonlinearity): SiLU()
)
)
)
)
(motion_modules): ModuleList(
(motion_modules): ModuleList(
(0-1): 2 x TransformerTemporalModel(
(0-1): 2 x VanillaTemporalModule(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(temporal_transformer): TemporalTransformer3DModel(
(proj_in): Linear(in_features=1280, out_features=1280, bias=True)
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(transformer_blocks): ModuleList(
(proj_in): Linear(in_features=1280, out_features=1280, bias=True)
(0): BasicTransformerBlock(
(transformer_blocks): ModuleList(
(pos_embed): SinusoidalPositionalEmbedding()
(0): TemporalTransformerBlock(
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(attention_blocks): ModuleList(
(attn1): Attention(
(0-1): 2 x TemporalSelfAttention(
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_out): ModuleList(
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
(1): Dropout(p=0.0, inplace=False)
)
(pos_encoder): PositionalEncoding(
(dropout): Dropout(p=0.0, inplace=False)
)
)
)
)
)
(norms): ModuleList(
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(0-1): 2 x LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(attn2): Attention(
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
)
(ff): FeedForward(
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(net): ModuleList(
(ff): FeedForward(
(0): GEGLU(
(net): ModuleList(
(proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True)
(0): GEGLU(
)
(proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True)
)
)
(ff_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
)
)
(proj_out): Linear(in_features=1280, out_features=1280, bias=True)
)
)
(proj_out): Linear(in_features=1280, out_features=1280, bias=True)
)
)
)
)
)
)
)
)
(up_blocks): ModuleList(
(up_blocks): ModuleList(
(0): CrossAttnUpBlockMotion(
(0): CrossAttnUpBlock3D(
(attentions): ModuleList(
(attentions): ModuleList(
(0-2): 3 x Transformer2DModel(
(0-2): 3 x Transformer2DModel(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(proj_in): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(transformer_blocks): ModuleList(
(transformer_blocks): ModuleList(
(0-9): 10 x BasicTransformerBlock(
(0-9): 10 x BasicTransformerBlock(
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(attn1): Attention(
(attn1): Attention(
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_out): ModuleList(
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
(1): Dropout(p=0.0, inplace=False)
)
)
)
)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(attn2): Attention(
(attn2): Attention(
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=2048, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=2048, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=2048, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=2048, out_features=1280, bias=False)
(to_out): ModuleList(
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
(1): Dropout(p=0.0, inplace=False)
)
)
)
)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(ff): FeedForward(
(ff): FeedForward(
(net): ModuleList(
(net): ModuleList(
(0): GEGLU(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True)
(proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True)
)
)
(1): Dropout(p=0.0, inplace=False)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True)
(2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True)
)
)
)
)
)
)
)
)
(proj_out): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(proj_out): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
)
)
)
)
(resnets): ModuleList(
(resnets): ModuleList(
(0-1): 2 x ResnetBlock2D(
(0-1): 2 x ResnetBlock2D(
(norm1): GroupNorm(32, 2560, eps=1e-05, affine=True)
(norm1): GroupNorm(32, 2560, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv1): LoRACompatibleConv(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)
(norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(nonlinearity): SiLU()
(conv_shortcut): LoRACompatibleConv(2560, 1280, kernel_size=(1, 1), stride=(1, 1))
(conv_shortcut): LoRACompatibleConv(2560, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
(2): ResnetBlock2D(
(2): ResnetBlock2D(
(norm1): GroupNorm(32, 1920, eps=1e-05, affine=True)
(norm1): GroupNorm(32, 1920, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(1920, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv1): LoRACompatibleConv(1920, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)
(norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(nonlinearity): SiLU()
(conv_shortcut): LoRACompatibleConv(1920, 1280, kernel_size=(1, 1), stride=(1, 1))
(conv_shortcut): LoRACompatibleConv(1920, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
)
)
(motion_modules): ModuleList(
(motion_modules): ModuleList(
(0-2): 3 x TransformerTemporalModel(
(0-2): 3 x VanillaTemporalModule(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(temporal_transformer): TemporalTransformer3DModel(
(proj_in): Linear(in_features=1280, out_features=1280, bias=True)
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(transformer_blocks): ModuleList(
(proj_in): Linear(in_features=1280, out_features=1280, bias=True)
(0): BasicTransformerBlock(
(transformer_blocks): ModuleList(
(pos_embed): SinusoidalPositionalEmbedding()
(0): TemporalTransformerBlock(
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(attention_blocks): ModuleList(
(attn1): Attention(
(0-1): 2 x TemporalSelfAttention(
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_out): ModuleList(
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
(1): Dropout(p=0.0, inplace=False)
)
(pos_encoder): PositionalEncoding(
(dropout): Dropout(p=0.0, inplace=False)
)
)
)
)
)
(norms): ModuleList(
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(0-1): 2 x LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(attn2): Attention(
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
)
(ff): FeedForward(
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(net): ModuleList(
(ff): FeedForward(
(0): GEGLU(
(net): ModuleList(
(proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True)
(0): GEGLU(
)
(proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True)
)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True)
)
)
(ff_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
)
)
(proj_out): Linear(in_features=1280, out_features=1280, bias=True)
)
)
(proj_out): Linear(in_features=1280, out_features=1280, bias=True)
)
)
)
)
(upsamplers): ModuleList(
(upsamplers): ModuleList(
(0): Upsample2D(
(0): Upsample2D(
(conv): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
)
)
)
)
(1): CrossAttnUpBlockMotion(
(1): CrossAttnUpBlock3D(
(attentions): ModuleList(
(attentions): ModuleList(
(0-2): 3 x Transformer2DModel(
(0-2): 3 x Transformer2DModel(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(proj_in): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(transformer_blocks): ModuleList(
(transformer_blocks): ModuleList(
(0-1): 2 x BasicTransformerBlock(
(0-1): 2 x BasicTransformerBlock(
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(attn1): Attention(
(attn1): Attention(
(to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_out): ModuleList(
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
(1): Dropout(p=0.0, inplace=False)
)
)
)
)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(attn2): Attention(
(attn2): Attention(
(to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_k): LoRACompatibleLinear(in_features=2048, out_features=640, bias=False)
(to_k): LoRACompatibleLinear(in_features=2048, out_features=640, bias=False)
(to_v): LoRACompatibleLinear(in_features=2048, out_features=640, bias=False)
(to_v): LoRACompatibleLinear(in_features=2048, out_features=640, bias=False)
(to_out): ModuleList(
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
(1): Dropout(p=0.0, inplace=False)
)
)
)
)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(ff): FeedForward(
(ff): FeedForward(
(net): ModuleList(
(net): ModuleList(
(0): GEGLU(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True)
(proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True)
)
)
(1): Dropout(p=0.0, inplace=False)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True)
(2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True)
)
)
)
)
)
)
)
)
(proj_out): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(proj_out): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
)
)
)
)
(resnets): ModuleList(
(resnets): ModuleList(
(0): ResnetBlock2D(
(0): ResnetBlock2D(
(norm1): GroupNorm(32, 1920, eps=1e-05, affine=True)
(norm1): GroupNorm(32, 1920, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(1920, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv1): LoRACompatibleConv(1920, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True)
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True)
(norm2): GroupNorm(32, 640, eps=1e-05, affine=True)
(norm2): GroupNorm(32, 640, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(nonlinearity): SiLU()
(conv_shortcut): LoRACompatibleConv(1920, 640, kernel_size=(1, 1), stride=(1, 1))
(conv_shortcut): LoRACompatibleConv(1920, 640, kernel_size=(1, 1), stride=(1, 1))
)
)
(1): ResnetBlock2D(
(1): ResnetBlock2D(
(norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)
(norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(1280, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv1): LoRACompatibleConv(1280, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True)
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True)
(norm2): GroupNorm(32, 640, eps=1e-05, affine=True)
(norm2): GroupNorm(32, 640, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(nonlinearity): SiLU()
(conv_shortcut): LoRACompatibleConv(1280, 640, kernel_size=(1, 1), stride=(1, 1))
(conv_shortcut): LoRACompatibleConv(1280, 640, kernel_size=(1, 1), stride=(1, 1))
)
)
(2): ResnetBlock2D(
(2): ResnetBlock2D(
(norm1): GroupNorm(32, 960, eps=1e-05, affine=True)
(norm1): GroupNorm(32, 960, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(960, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv1): LoRACompatibleConv(960, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True)
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True)
(norm2): GroupNorm(32, 640, eps=1e-05, affine=True)
(norm2): GroupNorm(32, 640, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(nonlinearity): SiLU()
(conv_shortcut): LoRACompatibleConv(960, 640, kernel_size=(1, 1), stride=(1, 1))
(conv_shortcut): LoRACompatibleConv(960, 640, kernel_size=(1, 1), stride=(1, 1))
)
)
)
)
(motion_modules): ModuleList(
(motion_modules): ModuleList(
(0-2): 3 x TransformerTemporalModel(
(0-2): 3 x VanillaTemporalModule(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(temporal_transformer): TemporalTransformer3DModel(
(proj_in): Linea
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): Linear(in_features=640, out_features=640, bias=True)
(transformer_blocks): ModuleList(
(0): TemporalTransformerBlock(
(attention_blocks): ModuleList(
(0-1): 2 x TemporalSelfAttention(
(to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
(pos_encoder): PositionalEncoding(
(dropout): Dropout(p=0.0, inplace=False)
)
)