Spaces: Running

Bachmann Roman Christian committed · Commit a6ebf2a · 1 Parent(s): 3b49518

app.py CHANGED
@@ -26,6 +26,7 @@ from mpl_toolkits.axes_grid1 import ImageGrid
 from tqdm import tqdm
 import random
 from functools import partial
+import time
 
 # import some common detectron2 utilities
 from detectron2 import model_zoo
@@ -290,7 +291,7 @@ def plot_predictions(input_dict, preds, masks, image_size=224):
     plt.close()
 
 
-def inference(img, num_rgb, num_depth, num_semseg, seed, perform_sampling, alphas, num_tokens):
+def inference(img, num_tokens, perform_sampling, num_rgb, num_depth, num_semseg, seed):
     im = Image.open(img)
 
     # Center crop and resize RGB
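A note on why the signature is reordered rather than just renamed: `gr.Interface` passes the values of its `inputs` components to `fn` positionally, so the parameter order must mirror the widget order, and this change only works together with the matching reorder of the `inputs` list in the last hunk below. A minimal sketch of that coupling, with a hypothetical `toy_inference` standing in for the real `inference` (the `gr.inputs.*` API matches the Gradio version used in this file):

import gradio as gr

def toy_inference(img, num_tokens, perform_sampling, num_rgb, num_depth, num_semseg, seed):
    # Each parameter receives the value of the component at the same position in `inputs`.
    return f"tokens={num_tokens}, manual={perform_sampling}, seed={seed}"

gr.Interface(
    fn=toy_inference,
    inputs=[
        gr.inputs.Image(type='filepath'),                    # -> img
        gr.inputs.Slider(minimum=0, maximum=588, default=98),  # -> num_tokens
        gr.inputs.Checkbox(default=False),                   # -> perform_sampling
        gr.inputs.Slider(minimum=0, maximum=196, default=32),  # -> num_rgb
        gr.inputs.Slider(minimum=0, maximum=196, default=32),  # -> num_depth
        gr.inputs.Slider(minimum=0, maximum=196, default=32),  # -> num_semseg
        gr.inputs.Number(default=0),                         # -> seed
    ],
    outputs='text',
)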
@@ -324,21 +325,22 @@ def inference(img, num_rgb, num_depth, num_semseg, seed, perform_sampling, alphas, num_tokens):
     input_dict = {k: v.to(device) for k,v in input_dict.items()}
 
 
-    torch.manual_seed(int(seed)) # change seed to resample new mask
-
     if perform_sampling:
         # Randomly sample masks
 
-
+        torch.manual_seed(int(time.time())) # Random mode is random
 
         preds, masks = multimae.forward(
            input_dict,
            mask_inputs=True, # True if forward pass should sample random masks
            num_encoded_tokens=num_tokens,
-           alphas=alphas
+           alphas=1.0
        )
     else:
         # Randomly sample masks using the specified number of tokens per modality
+
+        torch.manual_seed(int(seed)) # change seed to resample new mask
+
         task_masks = {domain: torch.ones(1,196).long().to(device) for domain in DOMAINS}
         selected_rgb_idxs = torch.randperm(196)[:num_rgb]
         selected_depth_idxs = torch.randperm(196)[:num_depth]
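The hunk cuts off mid-way through the manual branch. For orientation, here is a plausible sketch of the masking pattern the visible context lines suggest: every modality starts fully masked and the sampled patch indices are then revealed. The `DOMAINS` values and the 1 = masked / 0 = visible convention are assumptions, not confirmed by this diff:

import torch

DOMAINS = ['rgb', 'depth', 'semseg']  # assumed modality keys

def build_task_masks(num_visible, num_patches=196, device='cpu'):
    # 1 = masked patch, 0 = visible patch (assumed MultiMAE convention).
    task_masks = {d: torch.ones(1, num_patches).long().to(device) for d in DOMAINS}
    for domain, n in num_visible.items():
        idxs = torch.randperm(num_patches)[:n]  # sample n distinct patch indices
        task_masks[domain][0, idxs] = 0         # reveal the sampled patches
    return task_masks

masks = build_task_masks({'rgb': 32, 'depth': 32, 'semseg': 32})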
@@ -365,7 +367,7 @@ title = "MultiMAE"
 description = "Gradio demo for MultiMAE: Multi-modal Multi-task Masked Autoencoders. \
 Upload your own images or try one of the examples below to explore the multi-modal masked reconstruction of a pre-trained MultiMAE model. \
 Uploaded images are pseudo labeled using a DPT trained on Omnidata depth, and a Mask2Former trained on COCO. \
-Choose the number of visible tokens using the sliders below
+Choose the number of visible tokens using the sliders below and see how MultiMAE reconstructs the modalities!"
 
 article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2204.01678' \
 target='_blank'>MultiMAE: Multi-modal Multi-task Masked Autoencoders</a> | \
@@ -375,24 +377,18 @@ css = '.output-image{height: 713px !important}'
 
 # Example images
 os.system("wget https://i.imgur.com/c9ObJdK.jpg")
-examples = [['c9ObJdK.jpg',
+examples = [['c9ObJdK.jpg', 98, False, 32, 32, 32, 0]]
 
 gr.Interface(
     fn=inference,
     inputs=[
         gr.inputs.Image(label='RGB input image', type='filepath'),
+        gr.inputs.Slider(label='Number of input tokens', default=98, step=1, minimum=0, maximum=588),
+        gr.inputs.Checkbox(label='Manual mode: Check this to manually set the number of input tokens per modality using the sliders below', default=False),
         gr.inputs.Slider(label='Number of RGB input tokens', default=32, step=1, minimum=0, maximum=196),
         gr.inputs.Slider(label='Number of depth input tokens', default=32, step=1, minimum=0, maximum=196),
         gr.inputs.Slider(label='Number of semantic input tokens', default=32, step=1, minimum=0, maximum=196),
         gr.inputs.Number(label='Random seed: Change this to sample different masks', default=0),
-        gr.inputs.Checkbox(label='Randomize the number of tokens: Check this to ignore the above sliders and randomly sample the number \
-            of tokens per modality using the parameters below', default=False),
-        gr.inputs.Slider(label='Symmetric Dirichlet concentration parameter (α > 0). Low values (α << 1.0) result in a sampling behavior, \
-            where most of the time, all visible tokens will be sampled from a single modality. High values \
-            (α >> 1.0) result in similar numbers of tokens being sampled for each modality. α = 1.0 is equivalent \
-            to uniform sampling over the simplex and contains both previous cases and everything in between.',
-            default=1.0, step=0.1, minimum=0.1, maximum=5.0),
-        gr.inputs.Slider(label='Number of input tokens', default=98, step=1, minimum=0, maximum=588),
     ],
     outputs=[
         gr.outputs.Image(label='MultiMAE predictions', type='file')
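The removed slider's label describes how the symmetric Dirichlet concentration α, now hard-coded as alphas=1.0 in inference, shapes the split of visible tokens across modalities: low α concentrates the budget in one modality, high α spreads it evenly, and α = 1.0 samples uniformly over the simplex. A small, self-contained illustration of that behavior, independent of MultiMAE's internals:

import torch
from torch.distributions import Dirichlet

def sample_token_split(alpha, num_tokens=98, num_modalities=3):
    # Draw modality proportions from a symmetric Dirichlet(alpha) over the simplex.
    props = Dirichlet(torch.full((num_modalities,), alpha)).sample()
    # Scale to token counts; rounding means the sum is only approximately num_tokens.
    return (props * num_tokens).round().long()

print(sample_token_split(0.1))  # low alpha: most tokens usually land in one modality
print(sample_token_split(5.0))  # high alpha: roughly even split across modalities
print(sample_token_split(1.0))  # alpha = 1.0: uniform over the simplex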