from functools import partial
from math import log2, pi
from typing import Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange

def fourier_dimension_expansion(
    x: torch.Tensor,
    dim: int = 512,
    max_freq: int = 64,
    use_cos: bool = True,
    use_log: bool = True,
):
    """Expand the last dimension of ``x`` into ``dim`` Fourier features."""
    device, dtype, input_dim = x.device, x.dtype, x.shape[-1]
    # input_dim: 2 (e.g. normalized 2D coordinates)
    num_bands = dim // (2 * input_dim) if use_cos else dim // input_dim
    # with the defaults and input_dim = 2: num_bands = 512 // (2 * 2) = 128
    if use_log:
        scales = 2.0 ** torch.linspace(
            0.0, log2(max_freq), steps=num_bands, device=device, dtype=dtype
        )
    else:
        scales = torch.linspace(
            1.0, max_freq / 2, num_bands, device=device, dtype=dtype
        )
    x = x.unsqueeze(-1)
    # broadcast the frequency bands over all leading dimensions of x
    scales = scales[(*((None,) * (len(x.shape) - 1)), Ellipsis)]
    x = x * scales * pi
    x = torch.cat(
        [x.sin(), x.cos()] if use_cos else [x.sin()],
        dim=-1,
    )
    # flatten (input_dim, num_bands) -> dim features
    x = x.flatten(-2)
    return x

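
# Usage sketch (illustrative, not part of the original module): expanding a grid of
# 2D coordinates into Fourier features. Shapes follow from the defaults above
# (dim=512, use_cos=True, so num_bands = 512 // 4 = 128 per coordinate).
#
#     coords = torch.rand(2, 24 * 24, 2)            # b, (h w), 2
#     feats = fourier_dimension_expansion(coords)   # -> (2, 576, 512)
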
def flatten(
    flat_tensor: torch.Tensor,
    old: Tuple[int, int],
    new: Tuple[int, int],
) -> torch.Tensor:
    """Resize a flattened token grid of shape b (h w) c from ``old`` to ``new`` (h, w)."""
    if old[0] == new[0] and old[1] == new[1]:
        return flat_tensor
    tensor = flat_tensor.view(flat_tensor.shape[0], old[0], old[1], -1).permute(
        0, 3, 1, 2
    )  # b c h w
    tensor_interp = F.interpolate(
        tensor,
        size=(new[0], new[1]),
        mode='nearest',
    )
    flat_tensor_interp = tensor_interp.view(
        flat_tensor.shape[0], -1, new[0] * new[1]
    ).permute(
        0, 2, 1
    )  # b (h w) c
    return flat_tensor_interp.contiguous()

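
# Usage sketch (illustrative): resizing a flattened 24x24 token grid to 32x32 with
# nearest-neighbour interpolation while keeping the channel dimension.
#
#     tokens = torch.randn(2, 24 * 24, 256)                  # b, (h w), c
#     resized = flatten(tokens, old=(24, 24), new=(32, 32))  # -> (2, 1024, 256)
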
class DimensionAligner(nn.Module):
    def __init__(self, input_dims: list[int], hidden_dim: int):
        super().__init__()
        self.aligners = nn.ModuleList([])
        self.num_chunks = len(input_dims)
        self.checkpoint = True
        for input_dim in input_dims:
            self.aligners.append(nn.Linear(input_dim, hidden_dim))

    def forward(self, xs: list[torch.Tensor]) -> list[torch.Tensor]:
        # project each input onto the shared hidden dimension
        outs = [self.aligners[i](x) for i, x in enumerate(xs)]
        return outs

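
# Usage sketch (illustrative): projecting multi-scale features with different
# channel widths onto a shared hidden dimension.
#
#     aligner = DimensionAligner(input_dims=[192, 384, 768], hidden_dim=256)
#     xs = [torch.randn(2, 576, d) for d in (192, 384, 768)]
#     outs = aligner(xs)   # three tensors of shape (2, 576, 256)
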
class LayerScale(nn.Module):
    def __init__(
        self,
        dim: int,
        init_values: float | torch.Tensor = 1e-5,
        inplace: bool = False,
    ) -> None:
        super().__init__()
        self.inplace = inplace
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x.mul_(self.gamma) if self.inplace else x * self.gamma

def exists(val):
    return val is not None


def default(val, d):
    if exists(val):
        return val
    return d() if callable(d) else d


class SwiGLU(nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x, gates = x.chunk(2, dim=-1)
        return x * F.silu(gates)

class MLP(nn.Module):
    def __init__(
        self,
        input_dim: int,
        expansion: int = 4,
        dropout: float = 0.0,
        gated: bool = False,
        output_dim: int | None = None,
    ):
        super().__init__()
        if gated:
            # shrink the expansion so the gated variant keeps a comparable parameter count
            expansion = int(expansion * 2 / 3)
        hidden_dim = int(input_dim * expansion)
        output_dim = default(output_dim, input_dim)
        self.norm = nn.LayerNorm(input_dim)
        # SwiGLU halves the feature width, so the gated branch projects to twice the
        # hidden width up front so that proj2 still receives hidden_dim features
        self.proj1 = nn.Linear(input_dim, hidden_dim * 2 if gated else hidden_dim)
        self.proj2 = nn.Linear(hidden_dim, output_dim)
        self.act = nn.GELU() if not gated else SwiGLU()
        self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.norm(x)
        x = self.proj1(x)
        x = self.act(x)
        x = self.proj2(x)
        x = self.dropout(x)
        return x

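
# Usage sketch (illustrative): the gated variant routes proj1 through SwiGLU, which
# halves the feature width before proj2; the plain variant uses GELU.
#
#     mlp = MLP(input_dim=256, expansion=4, gated=True)
#     y = mlp(torch.randn(2, 576, 256))   # -> (2, 576, 256)
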
class AttentionBlock(nn.Module):
    def __init__(
        self,
        dim: int,
        num_heads: int = 4,
        expansion: int = 4,
        dropout: float = 0.0,
        cosine: bool = False,
        gated: bool = False,
        layer_scale: float = 1.0,
        context_dim: int | None = None,
        detach_query: bool = False,
        residual_ls: bool = False,
    ):
        super().__init__()
        self.dropout = dropout
        self.num_heads = num_heads
        self.hidden_dim = dim
        context_dim = dim if context_dim is None else context_dim
        self.mlp = MLP(dim, expansion=expansion, dropout=dropout, gated=gated)
        self.kv = nn.Linear(context_dim, dim * 2, bias=False)
        self.q = nn.Linear(dim, dim, bias=False)
        self.norm_attnx = nn.LayerNorm(dim)
        self.norm_attnctx = nn.LayerNorm(context_dim)
        self.cosine = cosine
        self.out = nn.Linear(dim, dim, bias=False)
        self.ls1_1 = (
            LayerScale(dim, layer_scale)
            if layer_scale > 0.0 and not residual_ls
            else nn.Identity()
        )
        self.ls1_2 = (
            LayerScale(dim, layer_scale)
            if layer_scale > 0.0 and residual_ls
            else nn.Identity()
        )
        self.ls2 = LayerScale(dim, layer_scale) if layer_scale > 0.0 else nn.Identity()
        self.detach_query = detach_query
    def attn(
        self,
        x: torch.Tensor,
        attn_bias: torch.Tensor | None = None,
        context: torch.Tensor | None = None,
        pos_embed: torch.Tensor | None = None,
        pos_embed_context: torch.Tensor | None = None,
        rope: nn.Module | None = None,
        rope_pos: torch.Tensor | None = None,
    ) -> torch.Tensor:
        if self.detach_query:
            x = x.detach()
        x = self.norm_attnx(x)
        context = self.norm_attnctx(context)
        k, v = rearrange(
            self.kv(context), 'b n (kv h d) -> b h n d kv', h=self.num_heads, kv=2
        ).unbind(dim=-1)
        q = rearrange(self.q(x), 'b n (h d) -> b h n d', h=self.num_heads)
        if rope is not None:
            # rotary embeddings expect (b, n, h, d): permute in and out accordingly
            q = rope(q.permute(0, 2, 1, 3), input_pos=rope_pos).permute(0, 2, 1, 3)
            k = rope(k.permute(0, 2, 1, 3), input_pos=rope_pos).permute(0, 2, 1, 3)
        else:
            if pos_embed is not None:
                pos_embed = rearrange(
                    pos_embed, 'b n (h d) -> b h n d', h=self.num_heads
                )
                q = q + pos_embed
            if pos_embed_context is not None:
                pos_embed_context = rearrange(
                    pos_embed_context, 'b n (h d) -> b h n d', h=self.num_heads
                )
                k = k + pos_embed_context
        if self.cosine:
            q, k = map(partial(F.normalize, p=2, dim=-1), (q, k))  # cosine sim
        x = F.scaled_dot_product_attention(
            q,
            k,
            v,
            # apply attention dropout only while training
            dropout_p=self.dropout if self.training else 0.0,
            attn_mask=attn_bias,
        )
        x = rearrange(x, 'b h n d -> b n (h d)')
        x = self.out(x)
        return x
    def forward(
        self,
        x: torch.Tensor,
        context: torch.Tensor | None = None,
        pos_embed: torch.Tensor | None = None,
        pos_embed_context: torch.Tensor | None = None,
        attn_bias: torch.Tensor | None = None,
        rope: nn.Module | None = None,
        rope_pos: torch.Tensor | None = None,
    ) -> torch.Tensor:
        # self-attention when no context is given, cross-attention otherwise
        context = x if context is None else context
        x = self.ls1_1(
            self.attn(
                x,
                rope=rope,
                rope_pos=rope_pos,
                attn_bias=attn_bias,
                context=context,
                pos_embed=pos_embed,
                pos_embed_context=pos_embed_context,
            )
        ) + self.ls1_2(x)
        x = self.ls2(self.mlp(x)) + x
        return x

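
# Usage sketch (illustrative): the block defaults to self-attention; passing a
# context tensor (with context_dim set accordingly) turns it into cross-attention.
#
#     block = AttentionBlock(dim=256, num_heads=4, context_dim=512)
#     x = torch.randn(2, 576, 256)
#     ctx = torch.randn(2, 100, 512)
#     y = block(x, context=ctx)   # -> (2, 576, 256)
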
class AttentionSeq(nn.Module):
    def __init__(
        self,
        num_blocks: int,
        dim: int,
        num_heads: int = 4,
        expansion: int = 4,
        dropout: float = 0.0,
        cosine: bool = False,
        gated: bool = False,
        layer_scale: float = 1.0,
        context_dim: int | None = None,
        detach_query: bool = False,
        residual_ls: bool = False,
    ):
        super().__init__()
        self.layers = nn.ModuleList(
            [
                AttentionBlock(
                    dim=dim,
                    num_heads=num_heads,
                    expansion=expansion,
                    dropout=dropout,
                    cosine=cosine,
                    gated=gated,
                    layer_scale=layer_scale,
                    context_dim=context_dim,
                    detach_query=detach_query,
                    residual_ls=residual_ls,
                )
                for _ in range(num_blocks)
            ]
        )

    def forward(
        self,
        x: torch.Tensor,
        context: torch.Tensor | None = None,
        pos_embed: torch.Tensor | None = None,
        pos_embed_context: torch.Tensor | None = None,
        attn_bias: torch.Tensor | None = None,
        rope: nn.Module | None = None,
        rope_pos: torch.Tensor | None = None,
    ) -> torch.Tensor:
        for layer in self.layers:
            x = layer(
                x,
                context=context,
                pos_embed=pos_embed,
                pos_embed_context=pos_embed_context,
                attn_bias=attn_bias,
                rope=rope,
                rope_pos=rope_pos,
            )
        return x

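
# Usage sketch (illustrative): stacking several attention blocks that repeatedly
# cross-attend from the same query tokens to a fixed context.
#
#     seq = AttentionSeq(num_blocks=2, dim=256, num_heads=4, context_dim=512)
#     y = seq(torch.randn(2, 576, 256), context=torch.randn(2, 100, 512))   # -> (2, 576, 256)
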
class ResidualConvNet(nn.Module):
    def __init__(
        self,
        dim,
        kernel_size: int = 3,
        padding_mode: str = 'zeros',
        dilation: int = 1,
        layer_scale: float = 1.0,
        use_norm: bool = False,
    ):
        super().__init__()
        self.conv1 = nn.Conv2d(
            dim,
            dim,
            kernel_size=kernel_size,
            padding=dilation * (kernel_size - 1) // 2,
            dilation=dilation,
            padding_mode=padding_mode,
        )
        self.conv2 = nn.Conv2d(
            dim,
            dim,
            kernel_size=kernel_size,
            padding=dilation * (kernel_size - 1) // 2,
            dilation=dilation,
            padding_mode=padding_mode,
        )
        self.activation = nn.LeakyReLU()
        self.gamma = (
            nn.Parameter(layer_scale * torch.ones(1, dim, 1, 1))
            if layer_scale > 0.0
            else 1.0
        )
        self.norm1 = nn.GroupNorm(dim // 16, dim) if use_norm else nn.Identity()
        self.norm2 = nn.GroupNorm(dim // 16, dim) if use_norm else nn.Identity()

    def forward(self, x):
        out = self.activation(x)
        out = self.conv1(out)
        out = self.norm1(out)
        out = self.activation(out)
        out = self.conv2(out)
        out = self.norm2(out)
        return self.gamma * out + x

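
# Usage sketch (illustrative): the block is shape preserving, so it can be stacked
# freely; with use_norm=True the channel count must be divisible by 16 for GroupNorm.
#
#     block = ResidualConvNet(dim=64, kernel_size=3, use_norm=True)
#     y = block(torch.randn(2, 64, 32, 32))   # -> (2, 64, 32, 32)
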
class ResidualUpsampler(nn.Module):
    def __init__(
        self,
        hidden_dim,
        output_dim: int | None = None,
        num_layers: int = 2,
        kernel_size: int = 3,
        layer_scale: float = 1.0,
        padding_mode: str = 'zeros',
        use_norm: bool = False,
        **kwargs,
    ):
        super().__init__()
        output_dim = output_dim if output_dim is not None else hidden_dim // 2
        self.convs = nn.ModuleList([])
        for _ in range(num_layers):
            self.convs.append(
                ResidualConvNet(
                    hidden_dim,
                    kernel_size=kernel_size,
                    layer_scale=layer_scale,
                    padding_mode=padding_mode,
                    use_norm=use_norm,
                )
            )
        self.up = nn.Sequential(
            nn.Conv2d(
                hidden_dim,
                output_dim,
                kernel_size=1,
                padding=0,
                padding_mode=padding_mode,
            ),
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
        )

    def forward(self, x: torch.Tensor):
        for conv in self.convs:
            x = conv(x)
        x = self.up(x)
        return x

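
# Usage sketch (illustrative): residual conv blocks followed by a 1x1 projection
# and bilinear 2x upsampling; by default the channel count is halved.
#
#     up = ResidualUpsampler(hidden_dim=128)
#     y = up(torch.randn(2, 128, 24, 24))   # -> (2, 64, 48, 48)
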