Spaces:

Yiming-M
/

ZIP

Running on Zero

App Files Files Community

Yiming-M committed on Aug 1

Commit

c628976

1 Parent(s): df60639

2025-08-01 10:49 🚀

Browse files

Files changed (8) hide show

app.py +5 -5
models/__init__.py +0 -29
models/clip_ebc/convnext.py +2 -64
models/clip_ebc/mobileclip.py +1 -69
models/clip_ebc/model.py +0 -42
models/clip_ebc/resnet.py +2 -87
models/clip_ebc/vit.py +8 -53
requirements.txt +0 -1

app.py CHANGED Viewed

@@ -791,7 +791,7 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(), title="ZIP Crowd Counting") as d
         gr.Markdown("""
         ### Step-by-step Guide:
-        1. **🎛️ Select Model**: Choose your preferred model variant, pre-trained dataset, and evaluation metric from the dropdown
         2. **📸 Upload Image**: Click the image area to upload your crowd photo or use clipboard
         3. **🚀 Analyze**: Click the "Analyze Crowd" button to start processing
         4. **📊 View Results**: Examine the density maps and crowd count in the output panels
@@ -821,20 +821,20 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(), title="ZIP Crowd Counting") as d
         - **ZIP-N**: Nano model for mobile applications
         - **ZIP-P**: Pico model for edge devices
-        ### Datasets:
         - **ShanghaiTech A**: Dense, low-resolution crowd scenes
         - **ShanghaiTech B**: Sparse, high-resolution crowd scenes
         - **UCF-QNRF**: Dense, ultra high-resolution crowd images
         - **NWPU-Crowd**: Largest ultra high-resolution crowd counting dataset
-        ### Metrics:
-        - **MAE**: Mean Absolute Error - average counting error
         - **NAE**: Normalized Absolute Error - relative counting error
         """)
 demo.launch(
     server_name="0.0.0.0",
-    server_port=7860,
     show_api=False,
     share=False
 )

         gr.Markdown("""
         ### Step-by-step Guide:
+        1. **🎛️ Select Model**: Choose your preferred model variant, pre-training dataset, and pre-training evaluation metric from the dropdown
         2. **📸 Upload Image**: Click the image area to upload your crowd photo or use clipboard
         3. **🚀 Analyze**: Click the "Analyze Crowd" button to start processing
         4. **📊 View Results**: Examine the density maps and crowd count in the output panels
         - **ZIP-N**: Nano model for mobile applications
         - **ZIP-P**: Pico model for edge devices
+        ### Pre-training Datasets:
         - **ShanghaiTech A**: Dense, low-resolution crowd scenes
         - **ShanghaiTech B**: Sparse, high-resolution crowd scenes
         - **UCF-QNRF**: Dense, ultra high-resolution crowd images
         - **NWPU-Crowd**: Largest ultra high-resolution crowd counting dataset
+        ### Pre-training Evaluation Metrics:
+        - **MAE**: Mean Absolute Error - average counting error.
         - **NAE**: Normalized Absolute Error - relative counting error
         """)
 demo.launch(
     server_name="0.0.0.0",
+    server_port=7861,
     show_api=False,
     share=False
 )

models/__init__.py CHANGED Viewed

@@ -17,12 +17,6 @@ def get_model(
     num_vpt: Optional[int] = None,
     vpt_drop: Optional[float] = None,
     input_size: Optional[int] = None,
-    adapter: bool = False,
-    adapter_reduction: Optional[int] = None,
-    lora: bool = False,
-    lora_rank: Optional[int] = None,
-    lora_alpha: Optional[int] = None,
-    lora_dropout: Optional[float] = None,
     norm: str = "none",
     act: str = "none",
     text_prompts: Optional[List[str]] = None
@@ -41,15 +35,6 @@ def get_model(
         num_vpt = model_info["config"].get("num_vpt", None)
         vpt_drop = model_info["config"].get("vpt_drop", None)
-        adapter = model_info["config"].get("adapter", False)
-        adapter_reduction = model_info["config"].get("adapter_reduction", None)
-        lora = model_info["config"].get("lora", False)
-        lora_rank = model_info["config"].get("lora_rank", None)
-        lora_alpha = model_info["config"].get("lora_alpha", None)
-        lora_dropout = model_info["config"].get("lora_dropout", None)
         input_size = model_info["config"].get("input_size", None)
         text_prompts = model_info["config"].get("text_prompts", None)
@@ -81,12 +66,6 @@ def get_model(
             num_vpt=num_vpt,
             vpt_drop=vpt_drop,
             input_size=input_size,
-            adapter=adapter,
-            adapter_reduction=adapter_reduction,
-            lora=lora,
-            lora_rank=lora_rank,
-            lora_alpha=lora_alpha,
-            lora_dropout=lora_dropout,
             text_prompts=text_prompts,
             norm=norm,
             act=act
@@ -101,20 +80,12 @@ def get_model(
             "num_vpt": num_vpt,
             "vpt_drop": vpt_drop,
             "input_size": input_size,
-            "adapter": adapter,
-            "adapter_reduction": adapter_reduction,
-            "lora": lora,
-            "lora_rank": lora_rank,
-            "lora_alpha": lora_alpha,
-            "lora_dropout": lora_dropout,
             "text_prompts": model.text_prompts,
             "norm": norm,
             "act": act
         }
     else:
-        assert not adapter, "adapter for non-CLIP models is not implemented yet"
-        assert not lora, "lora for non-CLIP models is not implemented yet"
         model = _ebc(
             model_name=model_name,
             block_size=block_size,

     num_vpt: Optional[int] = None,
     vpt_drop: Optional[float] = None,
     input_size: Optional[int] = None,
     norm: str = "none",
     act: str = "none",
     text_prompts: Optional[List[str]] = None
         num_vpt = model_info["config"].get("num_vpt", None)
         vpt_drop = model_info["config"].get("vpt_drop", None)
         input_size = model_info["config"].get("input_size", None)
         text_prompts = model_info["config"].get("text_prompts", None)
             num_vpt=num_vpt,
             vpt_drop=vpt_drop,
             input_size=input_size,
             text_prompts=text_prompts,
             norm=norm,
             act=act
             "num_vpt": num_vpt,
             "vpt_drop": vpt_drop,
             "input_size": input_size,
             "text_prompts": model.text_prompts,
             "norm": norm,
             "act": act
         }
     else:
         model = _ebc(
             model_name=model_name,
             block_size=block_size,

models/clip_ebc/convnext.py CHANGED Viewed

@@ -1,8 +1,7 @@
 from torch import nn, Tensor
 import open_clip
-from peft import get_peft_model, LoraConfig
-from ..utils import ConvRefine, ConvAdapter
 from ..utils import ConvUpsample, _get_norm_layer, _get_activation
@@ -41,8 +40,6 @@ class ConvNeXt(nn.Module):
         model_name: str,
         weight_name: str,
         block_size: int = 16,
-        adapter: bool = False,
-        adapter_reduction: int = 4,
         norm: str = "none",
         act: str = "none"
     ) -> None:
@@ -55,22 +52,11 @@ class ConvNeXt(nn.Module):
         # model = open_clip.create_model_from_pretrained(model_name, weight_name, return_transform=False).visual
         model = open_clip.create_model(model_name=model_name, pretrained=False, load_weights=False).visual
-        self.adapter = adapter
-        if adapter:
-            self.adapter_reduction = adapter_reduction
-            for param in model.parameters():
-                param.requires_grad = False
         self.stem = model.trunk.stem
         self.depth = len(model.trunk.stages)
         for idx, stage in enumerate(model.trunk.stages):
             setattr(self, f"stage{idx}", stage)
-            if adapter:
-                setattr(self, f"adapter{idx}", ConvAdapter(
-                    in_channels=stage.blocks[-1].mlp.fc2.out_features,
-                    bottleneck_channels=stage.blocks[-1].mlp.fc2.out_features // adapter_reduction,
-                ) if idx < self.depth - 1 else nn.Identity())  # No adapter for the last stage
         if self.model_name in ["convnext_base", "convnext_base_w", "convnext_base_w_320", "convnext_xxlarge"]:
             self.in_features, self.out_features = model.head.proj.in_features, model.head.proj.out_features
@@ -125,30 +111,12 @@ class ConvNeXt(nn.Module):
                 ),
             )
-    def train(self, mode: bool = True):
-        if self.adapter and mode:
-            # training:
-            self.stem.eval()
-            for idx in range(self.depth):
-                getattr(self, f"stage{idx}").eval()
-                getattr(self, f"adapter{idx}").train()
-            self.refiner.train()
-        else:
-            # evaluation:
-            for module in self.children():
-                module.train(mode)
     def forward(self, x: Tensor) -> Tensor:
         x = self.stem(x)
         for idx in range(self.depth):
             x = getattr(self, f"stage{idx}")(x)
-            if self.adapter:
-                x = getattr(self, f"adapter{idx}")(x)
         x = self.refiner(x)
         return x
@@ -157,44 +125,14 @@ def _convnext(
     model_name: str,
     weight_name: str,
     block_size: int = 16,
-    adapter: bool = False,
-    adapter_reduction: int = 4,
-    lora: bool = False,
-    lora_rank: int = 16,
-    lora_alpha: float = 32.0,
-    lora_dropout: float = 0.1,
     norm: str = "none",
     act: str = "none"
 ) -> ConvNeXt:
-    assert not (lora and adapter), "Lora and adapter cannot be used together."
     model = ConvNeXt(
         model_name=model_name,
         weight_name=weight_name,
         block_size=block_size,
-        adapter=adapter,
-        adapter_reduction=adapter_reduction,
         norm=norm,
         act=act
     )
-    if lora:
-        target_modules = []
-        for name, module in model.named_modules():
-            if isinstance(module, (nn.Linear, nn.Conv2d)) and "refiner" not in name:
-                target_modules.append(name)
-        lora_config = LoraConfig(
-            r=lora_rank,
-            lora_alpha=lora_alpha,
-            lora_dropout=lora_dropout,
-            bias="none",
-            target_modules=target_modules,
-        )
-        model = get_peft_model(model, lora_config)
-        # Unfreeze refiner
-        for name, module in model.named_modules():
-            if "refiner" in name:
-                module.requires_grad_(True)
     return model

 from torch import nn, Tensor
 import open_clip
+from ..utils import ConvRefine
 from ..utils import ConvUpsample, _get_norm_layer, _get_activation
         model_name: str,
         weight_name: str,
         block_size: int = 16,
         norm: str = "none",
         act: str = "none"
     ) -> None:
         # model = open_clip.create_model_from_pretrained(model_name, weight_name, return_transform=False).visual
         model = open_clip.create_model(model_name=model_name, pretrained=False, load_weights=False).visual
         self.stem = model.trunk.stem
         self.depth = len(model.trunk.stages)
         for idx, stage in enumerate(model.trunk.stages):
             setattr(self, f"stage{idx}", stage)
         if self.model_name in ["convnext_base", "convnext_base_w", "convnext_base_w_320", "convnext_xxlarge"]:
             self.in_features, self.out_features = model.head.proj.in_features, model.head.proj.out_features
                 ),
             )
     def forward(self, x: Tensor) -> Tensor:
         x = self.stem(x)
         for idx in range(self.depth):
             x = getattr(self, f"stage{idx}")(x)
         x = self.refiner(x)
         return x
     model_name: str,
     weight_name: str,
     block_size: int = 16,
     norm: str = "none",
     act: str = "none"
 ) -> ConvNeXt:
     model = ConvNeXt(
         model_name=model_name,
         weight_name=weight_name,
         block_size=block_size,
         norm=norm,
         act=act
     )
     return model

models/clip_ebc/mobileclip.py CHANGED Viewed

@@ -1,8 +1,7 @@
 from torch import nn, Tensor
 import open_clip
-from peft import get_peft_model, LoraConfig
-from ..utils import ConvRefine, ConvUpsample, ConvAdapter
 from ..utils import _get_norm_layer, _get_activation
@@ -29,8 +28,6 @@ class MobileCLIP(nn.Module):
         model_name: str,
         weight_name: str,
         block_size: int = 16,
-        adapter: bool = False,
-        adapter_reduction: int = 4,
         norm: str = "none",
         act: str = "none"
     ) -> None:
@@ -44,21 +41,10 @@ class MobileCLIP(nn.Module):
         # model = open_clip.create_model_from_pretrained(model_name, weight_name, return_transform=False).visual
         model = open_clip.create_model(model_name=model_name, pretrained=False, load_weights=False).visual
-        self.adapter = adapter
-        if adapter:
-            for param in model.parameters():
-                param.requires_grad = False
         self.stem = model.trunk.stem
         self.stages = model.trunk.stages
         self.depth = len(model.trunk.stages)
-        for idx, stage in enumerate(model.trunk.stages):
-            if adapter:
-                setattr(self, f"adapter{idx}", ConvAdapter(
-                    in_channels=stage.blocks[-1].mlp.fc2.out_channels,
-                    bottleneck_channels=stage.blocks[-1].mlp.fc2.out_channels // adapter_reduction,
-                ))
         self.final_conv = model.trunk.final_conv
@@ -114,31 +100,12 @@ class MobileCLIP(nn.Module):
                     groups=refiner_groups[self.model_name],
                 ),
             )
-    def train(self, mode: bool = True):
-        if self.adapter and mode:
-            # training:
-            self.stem.eval()
-            for idx in range(self.depth):
-                getattr(self, f"stage{idx}").eval()
-                getattr(self, f"adapter{idx}").train()
-            self.final_conv.eval()
-            self.refiner.train()
-        else:
-            # evaluation:
-            for module in self.children():
-                module.train(mode)
     def forward(self, x: Tensor) -> Tensor:
         x = self.stem(x)
         for idx in range(self.depth):
             x = self.stages[idx](x)
-            if self.adapter:
-                x = getattr(self, f"adapter{idx}")(x)
         x = self.final_conv(x)
@@ -150,49 +117,14 @@ def _mobileclip(
     model_name: str,
     weight_name: str,
     block_size: int = 16,
-    adapter: bool = False,
-    adapter_reduction: int = 4,
-    lora: bool = False,
-    lora_rank: int = 16,
-    lora_alpha: float = 32.0,
-    lora_dropout: float = 0.1,
     norm: str = "none",
     act: str = "none"
 ) -> MobileCLIP:
-    assert not (lora and adapter), "Lora and adapter cannot be used together."
     model = MobileCLIP(
         model_name=model_name,
         weight_name=weight_name,
         block_size=block_size,
-        adapter=adapter,
-        adapter_reduction=adapter_reduction,
         norm=norm,
         act=act
     )
-    if lora:
-        target_modules = []
-        for name, module in model.named_modules():
-            if isinstance(module, (nn.Linear, nn.Conv2d)):
-                target_modules.append(name)
-        lora_config = LoraConfig(
-            r=lora_rank,
-            lora_alpha=lora_alpha,
-            lora_dropout=lora_dropout,
-            bias="none",
-            target_modules=target_modules,
-        )
-        model = get_peft_model(model, lora_config)
-        # Unfreeze the BN layers
-        for name, module in model.named_modules() and "refiner" not in name:
-            if isinstance(module, nn.BatchNorm2d):
-                module.requires_grad_(True)
-        # Unfreeze refiner
-        for name, module in model.named_modules():
-            if "refiner" in name:
-                module.requires_grad_(True)
     return model

 from torch import nn, Tensor
 import open_clip
+from ..utils import ConvRefine, ConvUpsample
 from ..utils import _get_norm_layer, _get_activation
         model_name: str,
         weight_name: str,
         block_size: int = 16,
         norm: str = "none",
         act: str = "none"
     ) -> None:
         # model = open_clip.create_model_from_pretrained(model_name, weight_name, return_transform=False).visual
         model = open_clip.create_model(model_name=model_name, pretrained=False, load_weights=False).visual
         self.stem = model.trunk.stem
         self.stages = model.trunk.stages
         self.depth = len(model.trunk.stages)
         self.final_conv = model.trunk.final_conv
                     groups=refiner_groups[self.model_name],
                 ),
             )
     def forward(self, x: Tensor) -> Tensor:
         x = self.stem(x)
         for idx in range(self.depth):
             x = self.stages[idx](x)
         x = self.final_conv(x)
     model_name: str,
     weight_name: str,
     block_size: int = 16,
     norm: str = "none",
     act: str = "none"
 ) -> MobileCLIP:
     model = MobileCLIP(
         model_name=model_name,
         weight_name=weight_name,
         block_size=block_size,
         norm=norm,
         act=act
     )
     return model

models/clip_ebc/model.py CHANGED Viewed

@@ -31,12 +31,6 @@ class CLIP_EBC(nn.Module):
         num_vpt: Optional[int] = None,
         vpt_drop: Optional[float] = None,
         input_size: Optional[int] = None,
-        adapter: Optional[bool] = False,
-        adapter_reduction: Optional[int] = None,
-        lora: Optional[bool] = False,
-        lora_rank: Optional[int] = None,
-        lora_alpha: Optional[float] = None,
-        lora_dropout: Optional[float] = None,
         text_prompts: Optional[Dict[str, List[str]]] = None,
         norm: Optional[str] = "none",
         act: Optional[str] = "none",
@@ -70,12 +64,6 @@ class CLIP_EBC(nn.Module):
                 num_vpt=num_vpt,
                 vpt_drop=vpt_drop,
                 block_size=block_size,
-                adapter=adapter,
-                adapter_reduction=adapter_reduction,
-                lora=lora,
-                lora_rank=lora_rank,
-                lora_alpha=lora_alpha,
-                lora_dropout=lora_dropout,
                 input_size=(input_size, input_size),
                 norm=norm,
                 act=act
@@ -85,12 +73,6 @@ class CLIP_EBC(nn.Module):
                 model_name=model_name,
                 weight_name=weight_name,
                 block_size=block_size,
-                adapter=adapter,
-                adapter_reduction=adapter_reduction,
-                lora=lora,
-                lora_rank=lora_rank,
-                lora_alpha=lora_alpha,
-                lora_dropout=lora_dropout,
                 norm=norm,
                 act=act
             )
@@ -99,12 +81,6 @@ class CLIP_EBC(nn.Module):
                 model_name=model_name,
                 weight_name=weight_name,
                 block_size=block_size,
-                adapter=adapter,
-                adapter_reduction=adapter_reduction,
-                lora=lora,
-                lora_rank=lora_rank,
-                lora_alpha=lora_alpha,
-                lora_dropout=lora_dropout,
                 norm=norm,
                 act=act
             )
@@ -113,12 +89,6 @@ class CLIP_EBC(nn.Module):
                 model_name=model_name,
                 weight_name=weight_name,
                 block_size=block_size,
-                adapter=adapter,
-                adapter_reduction=adapter_reduction,
-                lora=lora,
-                lora_rank=lora_rank,
-                lora_alpha=lora_alpha,
-                lora_dropout=lora_dropout,
                 norm=norm,
                 act=act
             )
@@ -240,12 +210,6 @@ def _clip_ebc(
     num_vpt: Optional[int] = None,
     vpt_drop: Optional[float] = None,
     input_size: Optional[int] = None,
-    adapter: Optional[bool] = False,
-    adapter_reduction: Optional[int] = None,
-    lora: Optional[bool] = False,
-    lora_rank: Optional[int] = None,
-    lora_alpha: Optional[float] = None,
-    lora_dropout: Optional[float] = None,
     text_prompts: Optional[List[str]] = None,
     norm: Optional[str] = "none",
     act: Optional[str] = "none",
@@ -260,12 +224,6 @@ def _clip_ebc(
         num_vpt=num_vpt,
         vpt_drop=vpt_drop,
         input_size=input_size,
-        adapter=adapter,
-        adapter_reduction=adapter_reduction,
-        lora=lora,
-        lora_rank=lora_rank,
-        lora_alpha=lora_alpha,
-        lora_dropout=lora_dropout,
         text_prompts=text_prompts,
         norm=norm,
         act=act,

         num_vpt: Optional[int] = None,
         vpt_drop: Optional[float] = None,
         input_size: Optional[int] = None,
         text_prompts: Optional[Dict[str, List[str]]] = None,
         norm: Optional[str] = "none",
         act: Optional[str] = "none",
                 num_vpt=num_vpt,
                 vpt_drop=vpt_drop,
                 block_size=block_size,
                 input_size=(input_size, input_size),
                 norm=norm,
                 act=act
                 model_name=model_name,
                 weight_name=weight_name,
                 block_size=block_size,
                 norm=norm,
                 act=act
             )
                 model_name=model_name,
                 weight_name=weight_name,
                 block_size=block_size,
                 norm=norm,
                 act=act
             )
                 model_name=model_name,
                 weight_name=weight_name,
                 block_size=block_size,
                 norm=norm,
                 act=act
             )
     num_vpt: Optional[int] = None,
     vpt_drop: Optional[float] = None,
     input_size: Optional[int] = None,
     text_prompts: Optional[List[str]] = None,
     norm: Optional[str] = "none",
     act: Optional[str] = "none",
         num_vpt=num_vpt,
         vpt_drop=vpt_drop,
         input_size=input_size,
         text_prompts=text_prompts,
         norm=norm,
         act=act,

models/clip_ebc/resnet.py CHANGED Viewed

@@ -1,8 +1,7 @@
 from torch import nn, Tensor
 import open_clip
-from peft import get_peft_model, LoraConfig
-from ..utils import ConvRefine, ConvUpsample, ConvAdapter
 from ..utils import _get_norm_layer, _get_activation
@@ -37,8 +36,6 @@ class ResNet(nn.Module):
         model_name: str,
         weight_name: str,
         block_size: int = 16,
-        adapter: bool = False,
-        adapter_reduction: int = 4,
         norm: str = "none",
         act: str = "none"
     ) -> None:
@@ -52,11 +49,6 @@ class ResNet(nn.Module):
         # model = open_clip.create_model_from_pretrained(model_name, weight_name, return_transform=False).visual
         model = open_clip.create_model(model_name=model_name, pretrained=False, load_weights=False).visual
-        self.adapter = adapter
-        if adapter:
-            for param in model.parameters():
-                param.requires_grad = False
         # Stem
         self.conv1 = model.conv1
         self.bn1 = model.bn1
@@ -73,12 +65,7 @@ class ResNet(nn.Module):
         # Layers
         for idx in range(1, 5):
             setattr(self, f"layer{idx}", getattr(model, f"layer{idx}"))
-            if adapter:
-                setattr(self, f"adapter{idx}", ConvAdapter(
-                    in_channels=getattr(model, f"layer{idx}")[-1].conv3.out_channels,
-                    bottleneck_channels=getattr(model, f"layer{idx}")[-1].conv3.out_channels // adapter_reduction,
-                ) if idx < 4 else nn.Identity())  # No adapter for the last layer
         self.in_features = model.attnpool.c_proj.weight.shape[1]
         self.out_features = model.attnpool.c_proj.weight.shape[0]
@@ -129,31 +116,6 @@ class ResNet(nn.Module):
                     groups=refiner_groups[self.model_name],
                 ),
             )
-    def train(self, mode: bool = True):
-        if self.adapter and mode:
-            # training:
-            self.conv1.eval()
-            self.bn1.eval()
-            self.act1.eval()
-            self.conv2.eval()
-            self.bn2.eval()
-            self.act2.eval()
-            self.conv3.eval()
-            self.bn3.eval()
-            self.act3.eval()
-            self.avgpool.eval()
-            for idx in range(1, 5):
-                getattr(self, f"layer{idx}").eval()
-                getattr(self, f"adapter{idx}").train()
-            self.refiner.train()
-        else:
-            # evaluation:
-            for module in self.children():
-                module.train(mode)
     def stem(self, x: Tensor) -> Tensor:
         x = self.act1(self.bn1(self.conv1(x)))
@@ -166,21 +128,9 @@ class ResNet(nn.Module):
         x = self.stem(x)
         x = self.layer1(x)
-        if self.adapter:
-            x = self.adapter1(x)
         x = self.layer2(x)
-        if self.adapter:
-            x = self.adapter2(x)
         x = self.layer3(x)
-        if self.adapter:
-            x = self.adapter3(x)
         x = self.layer4(x)
-        if self.adapter:
-            x = self.adapter4(x)
         x = self.refiner(x)
         return x
@@ -189,49 +139,14 @@ def _resnet(
     model_name: str,
     weight_name: str,
     block_size: int = 16,
-    adapter: bool = False,
-    adapter_reduction: int = 4,
-    lora: bool = False,
-    lora_rank: int = 16,
-    lora_alpha: float = 32.0,
-    lora_dropout: float = 0.1,
     norm: str = "none",
     act: str = "none"
 ) -> ResNet:
-    assert not (lora and adapter), "Lora and adapter cannot be used together."
     model = ResNet(
         model_name=model_name,
         weight_name=weight_name,
         block_size=block_size,
-        adapter=adapter,
-        adapter_reduction=adapter_reduction,
         norm=norm,
         act=act
     )
-    if lora:
-        target_modules = []
-        for name, module in model.named_modules():
-            if isinstance(module, (nn.Linear, nn.Conv2d)):
-                target_modules.append(name)
-        lora_config = LoraConfig(
-            r=lora_rank,
-            lora_alpha=lora_alpha,
-            lora_dropout=lora_dropout,
-            bias="none",
-            target_modules=target_modules,
-        )
-        model = get_peft_model(model, lora_config)
-        # Unfreeze BN layers
-        for name, module in model.named_modules():
-            if isinstance(module, nn.BatchNorm2d) and "refiner" not in name:
-                module.requires_grad_(True)
-        # Unfreeze refiner
-        for name, module in model.named_modules():
-            if "refiner" in name:
-                module.requires_grad_(True)
     return model

 from torch import nn, Tensor
 import open_clip
+from ..utils import ConvRefine, ConvUpsample
 from ..utils import _get_norm_layer, _get_activation
         model_name: str,
         weight_name: str,
         block_size: int = 16,
         norm: str = "none",
         act: str = "none"
     ) -> None:
         # model = open_clip.create_model_from_pretrained(model_name, weight_name, return_transform=False).visual
         model = open_clip.create_model(model_name=model_name, pretrained=False, load_weights=False).visual
         # Stem
         self.conv1 = model.conv1
         self.bn1 = model.bn1
         # Layers
         for idx in range(1, 5):
             setattr(self, f"layer{idx}", getattr(model, f"layer{idx}"))
         self.in_features = model.attnpool.c_proj.weight.shape[1]
         self.out_features = model.attnpool.c_proj.weight.shape[0]
                     groups=refiner_groups[self.model_name],
                 ),
             )
     def stem(self, x: Tensor) -> Tensor:
         x = self.act1(self.bn1(self.conv1(x)))
         x = self.stem(x)
         x = self.layer1(x)
         x = self.layer2(x)
         x = self.layer3(x)
         x = self.layer4(x)
         x = self.refiner(x)
         return x
     model_name: str,
     weight_name: str,
     block_size: int = 16,
     norm: str = "none",
     act: str = "none"
 ) -> ResNet:
     model = ResNet(
         model_name=model_name,
         weight_name=weight_name,
         block_size=block_size,
         norm=norm,
         act=act
     )
     return model

models/clip_ebc/vit.py CHANGED Viewed

@@ -3,10 +3,9 @@ from torch import nn, Tensor
 import math
 from einops import rearrange
 import open_clip
-from peft import get_peft_model, LoraConfig
 from typing import Optional, Tuple
-from ..utils import interpolate_pos_embed, ViTAdapter
 # from ..utils import TransformerRefine, TransformerDownsample, TransformerUpsample
 from ..utils import ConvRefine, ConvDownsample, ConvUpsample
 from ..utils import _get_norm_layer, _get_activation
@@ -73,8 +72,6 @@ class ViT(nn.Module):
         block_size: int = 16,
         num_vpt: int = 32,
         vpt_drop: float = 0.0,
-        adapter: bool = False,
-        adapter_reduction: int = 4,
         input_size: Optional[Tuple[int, int]] = None,
         norm: str = "none",
         act: str = "none"
@@ -82,18 +79,14 @@ class ViT(nn.Module):
         super(ViT, self).__init__()
         assert model_name in vit_names_and_weights, f"Model name should be one of {list(vit_names_and_weights.keys())}, but got {model_name}."
         assert weight_name in vit_names_and_weights[model_name], f"Pretrained should be one of {vit_names_and_weights[model_name]}, but got {weight_name}."
-        if adapter:
-            assert num_vpt is None or num_vpt == 0, "num_vpt should be None or 0 when using adapter."
-            assert vpt_drop is None or vpt_drop == 0.0, "vpt_drop should be None or 0.0 when using adapter."
-        else:
-            assert num_vpt > 0, f"Number of VPT tokens should be greater than 0, but got {num_vpt}."
-            assert 0.0 <= vpt_drop < 1.0, f"VPT dropout should be in [0.0, 1.0), but got {vpt_drop}."
         self.model_name, self.weight_name = model_name, weight_name
         self.block_size = block_size
         self.num_vpt = num_vpt
         self.vpt_drop = vpt_drop
-        self.adapter = adapter
         # model = open_clip.create_model_from_pretrained(model_name, weight_name, return_transform=False).visual
         model = open_clip.create_model(model_name=model_name, pretrained=False, load_weights=False).visual
@@ -119,15 +112,9 @@ class ViT(nn.Module):
         # Setup VPT tokens
         val = math.sqrt(6. / float(3 * self.patch_size[0] + self.embed_dim))
         for idx in range(self.num_layers):
-            if self.adapter:
-                setattr(self, f"adapter{idx}", ViTAdapter(
-                    in_channels=self.embed_dim,
-                    bottleneck_channels=self.embed_dim // adapter_reduction,
-                ))
-            else:
-                setattr(self, f"vpt_{idx}", nn.Parameter(torch.empty(self.num_vpt, self.embed_dim)))
-                nn.init.uniform_(getattr(self, f"vpt_{idx}"), -val, val)
-                setattr(self, f"vpt_drop_{idx}", nn.Dropout(self.vpt_drop))
         # Adjust the positional embedding to match the new input size
         self._adjust_pos_embed()
@@ -299,13 +286,10 @@ class ViT(nn.Module):
         return x
-    def _forward_adapter(self, x: Tensor, idx: int) -> Tensor:
-        return getattr(self, f"adapter{idx}")(x)
     def forward_encoder(self, x: Tensor) -> Tensor:
         x = self._forward_patch_embed(x)
         for idx in range(self.num_layers):
-            x = self._forward_adapter(x, idx) if self.adapter else self._forward_vpt(x, idx)
         x = self.ln_post(x)
         return x
@@ -326,48 +310,19 @@ def _vit(
     block_size: int = 16,
     num_vpt: int = 32,
     vpt_drop: float = 0.1,
-    adapter: bool = False,
-    adapter_reduction: int = 4,
-    lora: bool = False,
-    lora_rank: int = 16,
-    lora_alpha: float = 32.0,
-    lora_dropout: float = 0.1,
     input_size: Optional[Tuple[int, int]] = None,
     norm: str = "none",
     act: str = "none"
 ) -> ViT:
-    assert not (lora and adapter), "LoRA and adapter cannot be used together."
     model = ViT(
         model_name=model_name,
         weight_name=weight_name,
         block_size=block_size,
         num_vpt=num_vpt,
         vpt_drop=vpt_drop,
-        adapter=adapter,
-        adapter_reduction=adapter_reduction,
         input_size=input_size,
         norm=norm,
         act=act
     )
-    if lora:
-        target_modules = []
-        for name, module in model.named_modules():
-            if isinstance(module, (nn.Linear, nn.Conv2d, nn.MultiheadAttention)) and "refiner" not in name:
-                target_modules.append(name)
-        lora_config = LoraConfig(
-            r=lora_rank,
-            lora_alpha=lora_alpha,
-            lora_dropout=lora_dropout,
-            bias="none",
-            target_modules=target_modules,
-        )
-        model = get_peft_model(model, lora_config)
-        # Unfreeze refiner
-        for name, module in model.named_modules():
-            if "refiner" in name:
-                module.requires_grad_(True)
     return model

 import math
 from einops import rearrange
 import open_clip
 from typing import Optional, Tuple
+from ..utils import interpolate_pos_embed
 # from ..utils import TransformerRefine, TransformerDownsample, TransformerUpsample
 from ..utils import ConvRefine, ConvDownsample, ConvUpsample
 from ..utils import _get_norm_layer, _get_activation
         block_size: int = 16,
         num_vpt: int = 32,
         vpt_drop: float = 0.0,
         input_size: Optional[Tuple[int, int]] = None,
         norm: str = "none",
         act: str = "none"
         super(ViT, self).__init__()
         assert model_name in vit_names_and_weights, f"Model name should be one of {list(vit_names_and_weights.keys())}, but got {model_name}."
         assert weight_name in vit_names_and_weights[model_name], f"Pretrained should be one of {vit_names_and_weights[model_name]}, but got {weight_name}."
+        assert num_vpt > 0, f"Number of VPT tokens should be greater than 0, but got {num_vpt}."
+        assert 0.0 <= vpt_drop < 1.0, f"VPT dropout should be in [0.0, 1.0), but got {vpt_drop}."
         self.model_name, self.weight_name = model_name, weight_name
         self.block_size = block_size
         self.num_vpt = num_vpt
         self.vpt_drop = vpt_drop
         # model = open_clip.create_model_from_pretrained(model_name, weight_name, return_transform=False).visual
         model = open_clip.create_model(model_name=model_name, pretrained=False, load_weights=False).visual
         # Setup VPT tokens
         val = math.sqrt(6. / float(3 * self.patch_size[0] + self.embed_dim))
         for idx in range(self.num_layers):
+            setattr(self, f"vpt_{idx}", nn.Parameter(torch.empty(self.num_vpt, self.embed_dim)))
+            nn.init.uniform_(getattr(self, f"vpt_{idx}"), -val, val)
+            setattr(self, f"vpt_drop_{idx}", nn.Dropout(self.vpt_drop))
         # Adjust the positional embedding to match the new input size
         self._adjust_pos_embed()
         return x
     def forward_encoder(self, x: Tensor) -> Tensor:
         x = self._forward_patch_embed(x)
         for idx in range(self.num_layers):
+            x = self._forward_vpt(x, idx)
         x = self.ln_post(x)
         return x
     block_size: int = 16,
     num_vpt: int = 32,
     vpt_drop: float = 0.1,
     input_size: Optional[Tuple[int, int]] = None,
     norm: str = "none",
     act: str = "none"
 ) -> ViT:
     model = ViT(
         model_name=model_name,
         weight_name=weight_name,
         block_size=block_size,
         num_vpt=num_vpt,
         vpt_drop=vpt_drop,
         input_size=input_size,
         norm=norm,
         act=act
     )
     return model

requirements.txt CHANGED Viewed

@@ -3,7 +3,6 @@ gradio==5.23.1
 huggingface_hub==0.29.3
 matplotlib==3.10.1
 numpy==2.2.4
-peft==0.7.0
 Pillow==11.3.0
 spaces==0.39.0
 timm==1.0.19

 huggingface_hub==0.29.3
 matplotlib==3.10.1
 numpy==2.2.4
 Pillow==11.3.0
 spaces==0.39.0
 timm==1.0.19