Upload app.py

app.py CHANGED
@@ -11,7 +11,7 @@ import math
 from utils import compute_ca_loss
 from gradio import processing_utils
 from typing import Optional
-
+import spaces
 import warnings

 import sys
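The added `import spaces` enables Hugging Face ZeroGPU, where the process holds a GPU only while a function decorated with `@spaces.GPU` runs. A minimal sketch of that pattern (the pipeline name below is illustrative, not from this repo):

```python
import spaces  # must be imported before CUDA is initialized
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

@spaces.GPU(duration=180)  # request a GPU for up to 180 s per call
def run(prompt: str):
    pipe.to("cuda")
    return pipe(prompt, num_inference_steps=30).images[0]
```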
@@ -67,96 +67,7 @@ def draw_box(boxes=[], texts=[], img=None):
               fill=(255, 255, 255))
     return img

-'''
-inference model
-'''
-
-def inference(device, unet, vae, tokenizer, text_encoder, prompt, bboxes, object_positions, batch_size, loss_scale, loss_threshold, max_iter, max_index_step, rand_seed, guidance_scale):
-    uncond_input = tokenizer(
-        [""] * 1, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pt"
-    )
-    uncond_embeddings = text_encoder(uncond_input.input_ids.to(device))[0]
-
-    input_ids = tokenizer(
-        prompt,
-        padding="max_length",
-        truncation=True,
-        max_length=tokenizer.model_max_length,
-        return_tensors="pt",
-    ).input_ids[0].unsqueeze(0).to(device)
-    # text_embeddings = text_encoder(input_ids)[0]
-    text_embeddings = torch.cat([uncond_embeddings, text_encoder(input_ids)[0]])
-    # text_embeddings[1, 1, :] = text_embeddings[1, 2, :]
-    generator = torch.manual_seed(rand_seed)  # Seed generator to create the inital latent noise
-
-    latents = torch.randn(
-        (batch_size, 4, 64, 64),
-        generator=generator,
-    ).to(device)
-
-    noise_scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
-
-    # generator = torch.Generator("cuda").manual_seed(1024)
-    noise_scheduler.set_timesteps(51)
-
-    latents = latents * noise_scheduler.init_noise_sigma
-
-    loss = torch.tensor(10000)
-
-    for index, t in enumerate(noise_scheduler.timesteps):
-        iteration = 0
-
-        while loss.item() / loss_scale > loss_threshold and iteration < max_iter and index < max_index_step:
-            latents = latents.requires_grad_(True)
-
-            # latent_model_input = torch.cat([latents] * 2)
-            latent_model_input = latents
-
-            latent_model_input = noise_scheduler.scale_model_input(latent_model_input, t)
-            noise_pred, attn_map_integrated_up, attn_map_integrated_mid, attn_map_integrated_down = \
-                unet(latent_model_input, t, encoder_hidden_states=text_encoder(input_ids)[0])
-
-            # update latents with guidence from gaussian blob
-
-            loss = compute_ca_loss(attn_map_integrated_mid, attn_map_integrated_up, bboxes=bboxes,
-                                   object_positions=object_positions) * loss_scale
-
-            print(loss.item() / loss_scale)
-
-            grad_cond = torch.autograd.grad(loss.requires_grad_(True), [latents])[0]
-
-            latents = latents - grad_cond * noise_scheduler.sigmas[index] ** 2
-            iteration += 1
-            torch.cuda.empty_cache()
-        torch.cuda.empty_cache()
-
-
-        with torch.no_grad():
-
-            latent_model_input = torch.cat([latents] * 2)
-
-            latent_model_input = noise_scheduler.scale_model_input(latent_model_input, t)
-            noise_pred, attn_map_integrated_up, attn_map_integrated_mid, attn_map_integrated_down = \
-                unet(latent_model_input, t, encoder_hidden_states=text_embeddings)
-
-            noise_pred = noise_pred.sample
-
-            # perform classifier-free guidance
-            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-            latents = noise_scheduler.step(noise_pred, t, latents).prev_sample
-            torch.cuda.empty_cache()
-    # Decode image
-    with torch.no_grad():
-        # print("decode image")
-        latents = 1 / 0.18215 * latents
-        image = vae.decode(latents).sample
-        image = (image / 2 + 0.5).clamp(0, 1)
-        image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
-        images = (image * 255).round().astype("uint8")
-        pil_images = [Image.fromarray(image) for image in images]
-        return pil_images

 def get_concat(ims):
     if len(ims) == 1:
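The deleted `inference` function (re-added inside `main()` further down) implements layout guidance: at early timesteps it repeatedly backpropagates a cross-attention loss into the latents before taking the ordinary scheduler step. A schematic of the update rule under the same variable names (a sketch, not the repo's code):

```python
import torch

def guided_latent_step(latents, loss, sigma):
    """One guidance step: move latents down the gradient of the layout loss.

    `loss` must be computed from `latents` with requires_grad enabled;
    the step is scaled by sigma**2, as in the diff above.
    """
    grad_cond = torch.autograd.grad(loss, [latents])[0]
    return latents - grad_cond * sigma ** 2
```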
@@ -172,42 +83,6 @@ def get_concat(ims):
     return dst


-def generate(unet, vae, tokenizer, text_encoder, language_instruction, grounding_texts, sketch_pad,
-             loss_threshold, guidance_scale, batch_size, rand_seed, max_step, loss_scale, max_iter,
-             state):
-    if 'boxes' not in state:
-        state['boxes'] = []
-    boxes = state['boxes']
-    grounding_texts = [x.strip() for x in grounding_texts.split(';')]
-    # assert len(boxes) == len(grounding_texts)
-    if len(boxes) != len(grounding_texts):
-        if len(boxes) < len(grounding_texts):
-            raise ValueError("""The number of boxes should be equal to the number of grounding objects.
-Number of boxes drawn: {}, number of grounding tokens: {}.
-Please draw boxes accordingly on the sketch pad.""".format(len(boxes), len(grounding_texts)))
-        grounding_texts = grounding_texts + [""] * (len(boxes) - len(grounding_texts))
-
-    boxes = (np.asarray(boxes) / 512).tolist()
-    boxes = [[box] for box in boxes]
-    grounding_instruction = json.dumps({obj: box for obj, box in zip(grounding_texts, boxes)})
-    language_instruction_list = language_instruction.strip('.').split(' ')
-    object_positions = []
-    for obj in grounding_texts:
-        obj_position = []
-        for word in obj.split(' '):
-            obj_first_index = language_instruction_list.index(word) + 1
-            obj_position.append(obj_first_index)
-        object_positions.append(obj_position)
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
-
-    gen_images = inference(device, unet, vae, tokenizer, text_encoder, language_instruction, boxes, object_positions, batch_size, loss_scale, loss_threshold, max_iter, max_step, rand_seed, guidance_scale)
-
-    blank_samples = batch_size % 2 if batch_size > 1 else 0
-    gen_images = [gr.Image.update(value=x, visible=True) for i, x in enumerate(gen_images)] \
-                 + [gr.Image.update(value=None, visible=True) for _ in range(blank_samples)] \
-                 + [gr.Image.update(value=None, visible=False) for _ in range(4 - batch_size - blank_samples)]
-
-    return gen_images + [state]


 def binarize(x):
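`generate` (also moved into `main()` below) converts each grounding phrase into the 1-based token positions of its words in the prompt; the `+ 1` accounts for the tokenizer's start-of-sequence token, and `list.index` raises ValueError if a grounding word is absent from the prompt. A worked example of that mapping (a sketch):

```python
prompt = "a cat and a dog"
words = prompt.strip('.').split(' ')   # ['a', 'cat', 'and', 'a', 'dog']
phrases = ["cat", "dog"]
object_positions = [[words.index(w) + 1 for w in p.split(' ')] for p in phrases]
print(object_positions)                # [[2], [5]]
```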
@@ -251,8 +126,9 @@ def center_crop(img, HW=None, tgt_size=(512, 512)):

 def draw(input, grounding_texts, new_image_trigger, state):
     if type(input) == dict:
-        image = input['image']
-        mask = input['mask']
+        # import pdb; pdb.set_trace()
+        # image = input['composite']
+        mask = input['composite']
     else:
         mask = input
     if mask.ndim == 3:
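The change in `draw` tracks Gradio 4's sketch components: `gr.Paint`/`gr.ImageEditor` hand callbacks a dict whose flattened drawing sits under the `'composite'` key (alongside `'background'` and `'layers'`), replacing the Gradio 3 `'image'`/`'mask'` pair. A defensive sketch:

```python
import numpy as np

def extract_mask(payload):
    # Gradio 4 editor payloads are dicts; plain arrays pass through unchanged.
    if isinstance(payload, dict):
        return payload['composite']
    return payload

print(extract_mask({'composite': np.zeros((4, 4))}).shape)  # (4, 4)
```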
@@ -307,7 +183,7 @@ def clear(task, sketch_pad_trigger, batch_size, state, switch_task=False):
     if task != 'Grounded Inpainting':
         sketch_pad_trigger = sketch_pad_trigger + 1
     blank_samples = batch_size % 2 if batch_size > 1 else 0
-    out_images = [gr.Image.update(value=None, visible=True) for i in range(batch_size)]
+    out_images = [gr.Image.change(value=None, visible=True) for i in range(batch_size)]
     # state = {}
     return [None, sketch_pad_trigger, None, 1.0] + out_images + [{}]
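Note that `gr.Image.change(...)` registers an event listener rather than building a component update, and `gr.Image.update(...)` no longer exists in Gradio 4; the idiomatic migration returns component constructors or `gr.update(...)` from the callback. A sketch of how `clear` would typically look against Gradio 4 (an assumption about intent, not what this commit ships):

```python
import gradio as gr

def clear_outputs(batch_size):
    # Return one update per output slot; unused slots get visible=False.
    shown = [gr.Image(value=None, visible=True) for _ in range(batch_size)]
    hidden = [gr.Image(value=None, visible=False) for _ in range(4 - batch_size)]
    return shown + hidden
```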
@@ -387,6 +263,139 @@ def main():
     text_encoder.to(device)
     vae.to(device)

+    def generate(unet, vae, tokenizer, text_encoder, language_instruction, grounding_texts, sketch_pad,
+                 loss_threshold, guidance_scale, batch_size, rand_seed, max_step, loss_scale, max_iter,
+                 state):
+        if 'boxes' not in state:
+            state['boxes'] = []
+        boxes = state['boxes']
+        grounding_texts = [x.strip() for x in grounding_texts.split(';')]
+        # assert len(boxes) == len(grounding_texts)
+        if len(boxes) != len(grounding_texts):
+            if len(boxes) < len(grounding_texts):
+                raise ValueError("""The number of boxes should be equal to the number of grounding objects.
+Number of boxes drawn: {}, number of grounding tokens: {}.
+Please draw boxes accordingly on the sketch pad.""".format(len(boxes), len(grounding_texts)))
+            grounding_texts = grounding_texts + [""] * (len(boxes) - len(grounding_texts))
+
+        boxes = (np.asarray(boxes) / 512).tolist()
+        boxes = [[box] for box in boxes]
+        grounding_instruction = json.dumps({obj: box for obj, box in zip(grounding_texts, boxes)})
+        language_instruction_list = language_instruction.strip('.').split(' ')
+        object_positions = []
+        for obj in grounding_texts:
+            obj_position = []
+            for word in obj.split(' '):
+                obj_first_index = language_instruction_list.index(word) + 1
+                obj_position.append(obj_first_index)
+            object_positions.append(obj_position)
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+        gen_images = inference(device, unet, vae, tokenizer, text_encoder, language_instruction, boxes,
+                               object_positions, batch_size, loss_scale, loss_threshold, max_iter, max_step, rand_seed,
+                               guidance_scale)
+
+        blank_samples = batch_size % 2 if batch_size > 1 else 0
+        gen_images = [gr.Image.update(value=x, visible=True) for i, x in enumerate(gen_images)] \
+                     + [gr.Image.change(fn=None, show_api=True) for _ in range(blank_samples)] \
+                     + [gr.Image.change(fn=None, show_api=False) for _ in range(4 - batch_size - blank_samples)]
+
+        return gen_images + [state]
+
+    '''
+    inference model
+    '''
+
+    @spaces.GPU(duration=180)
+    def inference(device, unet, vae, tokenizer, text_encoder, prompt, bboxes, object_positions, batch_size, loss_scale,
+                  loss_threshold, max_iter, max_index_step, rand_seed, guidance_scale):
+        uncond_input = tokenizer(
+            [""] * 1, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pt"
+        )
+        uncond_embeddings = text_encoder(uncond_input.input_ids.to(device))[0]
+
+        input_ids = tokenizer(
+            prompt,
+            padding="max_length",
+            truncation=True,
+            max_length=tokenizer.model_max_length,
+            return_tensors="pt",
+        ).input_ids[0].unsqueeze(0).to(device)
+        # text_embeddings = text_encoder(input_ids)[0]
+        text_embeddings = torch.cat([uncond_embeddings, text_encoder(input_ids)[0]])
+        # text_embeddings[1, 1, :] = text_embeddings[1, 2, :]
+        generator = torch.manual_seed(rand_seed)  # Seed generator to create the initial latent noise
+
+        latents = torch.randn(
+            (batch_size, 4, 64, 64),
+            generator=generator,
+        ).to(device)
+
+        noise_scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear",
+                                               num_train_timesteps=1000)
+
+        # generator = torch.Generator("cuda").manual_seed(1024)
+        noise_scheduler.set_timesteps(51)
+
+        latents = latents * noise_scheduler.init_noise_sigma
+
+        loss = torch.tensor(10000)
+
+        for index, t in enumerate(noise_scheduler.timesteps):
+            iteration = 0
+
+            while loss.item() / loss_scale > loss_threshold and iteration < max_iter and index < max_index_step:
+                latents = latents.requires_grad_(True)
+
+                # latent_model_input = torch.cat([latents] * 2)
+                latent_model_input = latents
+
+                latent_model_input = noise_scheduler.scale_model_input(latent_model_input, t)
+                noise_pred, attn_map_integrated_up, attn_map_integrated_mid, attn_map_integrated_down = \
+                    unet(latent_model_input, t, encoder_hidden_states=text_encoder(input_ids)[0])
+
+                # update latents with guidance from gaussian blob
+
+                loss = compute_ca_loss(attn_map_integrated_mid, attn_map_integrated_up, bboxes=bboxes,
+                                       object_positions=object_positions) * loss_scale
+
+                print(loss.item() / loss_scale)
+
+                grad_cond = torch.autograd.grad(loss.requires_grad_(True), [latents])[0]
+
+                latents = latents - grad_cond * noise_scheduler.sigmas[index] ** 2
+                iteration += 1
+                torch.cuda.empty_cache()
+            torch.cuda.empty_cache()
+
+            with torch.no_grad():
+                latent_model_input = torch.cat([latents] * 2)
+
+                latent_model_input = noise_scheduler.scale_model_input(latent_model_input, t)
+                noise_pred, attn_map_integrated_up, attn_map_integrated_mid, attn_map_integrated_down = \
+                    unet(latent_model_input, t, encoder_hidden_states=text_embeddings)
+
+                noise_pred = noise_pred.sample
+
+                # perform classifier-free guidance
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                latents = noise_scheduler.step(noise_pred, t, latents).prev_sample
+                torch.cuda.empty_cache()
+        # Decode image
+        with torch.no_grad():
+            # print("decode image")
+            latents = 1 / 0.18215 * latents
+            image = vae.decode(latents).sample
+            image = (image / 2 + 0.5).clamp(0, 1)
+            image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
+            images = (image * 255).round().astype("uint8")
+            pil_images = [Image.fromarray(image) for image in images]
+            return pil_images
+
+
     with Blocks(
             css=css,
             analytics_enabled=False,
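Two standard Stable Diffusion pieces inside `inference` are worth spelling out: the duplicated latent batch lets one UNet call produce both the unconditional and the text-conditioned noise predictions, which are recombined with the guidance scale, and the 1/0.18215 factor undoes the SD v1 latent scaling before VAE decoding. A condensed sketch:

```python
import torch

def cfg_combine(noise_pred, guidance_scale):
    # The batch is ordered [unconditional; conditional], matching
    # torch.cat([uncond_embeddings, ...]) in the diff.
    uncond, text = noise_pred.chunk(2)
    return uncond + guidance_scale * (text - uncond)

def decode_latents(vae, latents):
    image = vae.decode(latents / 0.18215).sample  # undo SD v1 latent scaling
    return (image / 2 + 0.5).clamp(0, 1)          # map [-1, 1] -> [0, 1]
```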
@@ -418,7 +427,7 @@ def main():


         with gr.Row():
-            sketch_pad = gr.Paint(label="Sketch Pad", elem_id="img2img_image")
+            sketch_pad = gr.Paint(label="Sketch Pad", container=False, layers=False, scale=1, elem_id="img2img_image")
             out_imagebox = gr.Image(type="pil", label="Parsed Sketch Pad")
             out_gen_1 = gr.Image(type="pil", visible=True, label="Generated Image")

@@ -479,7 +488,7 @@ def main():
        inputs=sketch_pad_trigger,
        outputs=sketch_pad_trigger,
        queue=False)
-    sketch_pad.edit(
+    sketch_pad.change(
        draw,
        inputs=[sketch_pad, grounding_instruction, sketch_pad_resize_trigger, state],
        outputs=[out_imagebox, sketch_pad_resize_trigger, image_scale, state],
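The UI changes follow Gradio 4: `gr.Paint` is the ImageEditor preset for freehand drawing, and event wiring hangs off listener methods such as `.change`. A self-contained sketch of the wiring shape used here (the handler and components are placeholders, not this app's code):

```python
import gradio as gr

def on_draw(payload, state):
    mask = payload['composite'] if isinstance(payload, dict) else payload
    return mask, state

with gr.Blocks() as demo:
    state = gr.State({})
    with gr.Row():
        sketch_pad = gr.Paint(label="Sketch Pad", container=False, layers=False, scale=1)
        out_imagebox = gr.Image(label="Parsed Sketch Pad")
    # .change fires whenever the component's value changes
    sketch_pad.change(on_draw, inputs=[sketch_pad, state],
                      outputs=[out_imagebox, state], queue=False)
```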
@@ -519,13 +528,13 @@ def main():
        None,
        None,
        sketch_pad_resize_trigger,
-        _js=rescale_js,
+        js=rescale_js,
        queue=False)
    init_white_trigger.change(
        None,
        None,
        init_white_trigger,
-        _js=rescale_js,
+        js=rescale_js,
        queue=False)

    with gr.Column():
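The `js=rescale_js` lines reflect Gradio 4 renaming the `_js` event argument to `js`; passing `fn=None` together with `js` runs the snippet purely client-side. A runnable sketch (the JavaScript body is illustrative):

```python
import gradio as gr

with gr.Blocks() as demo:
    btn = gr.Button("Log size")
    out = gr.Textbox(label="Viewport width")
    # fn=None with js=... runs entirely in the browser (Gradio 3.x spelled this `_js=`)
    btn.click(None, None, out, js="() => String(window.innerWidth)", queue=False)

demo.launch()
```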
@@ -546,7 +555,7 @@ def main():
    description = """<p> The source code of this demo is modified from the <a href="https://huggingface.co/spaces/gligen/demo/tree/main">GLIGEN</a> demo. Thanks! </p>"""
    gr.HTML(description)

-    demo.queue(concurrency_count=1, api_open=False)
+    demo.queue(api_open=False)
    demo.launch(share=False, show_api=False, show_error=True)

 if __name__ == '__main__':
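`demo.queue()` also changed across versions: Gradio 4 dropped `concurrency_count`, so the migrated call keeps only `api_open=False`; app-wide concurrency is now set with `default_concurrency_limit`. A sketch of the migrated queue/launch shape:

```python
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("queue/launch shape used by this app")

# Gradio 4: concurrency_count is gone; default_concurrency_limit replaces it
demo.queue(api_open=False, default_concurrency_limit=1)
demo.launch(share=False, show_api=False, show_error=True)
```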