import torch
import gradio as gr
from PIL import Image
import numpy as np
from transformers import CLIPImageProcessor, CLIPVisionModel
from diffusers import AutoencoderKL, DDPMScheduler
from src.diffusers.models.referencenet.referencenet_unet_2d_condition import (
    ReferenceNetModel,
)
from src.diffusers.models.referencenet.unet_2d_condition import UNet2DConditionModel
from src.diffusers.pipelines.referencenet.pipeline_referencenet import (
    StableDiffusionReferenceNetPipeline,
)
from utils.anonymize_faces_in_image import anonymize_faces_in_image
import face_alignment

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def load_pipeline() -> StableDiffusionReferenceNetPipeline:
    """Assemble the ReferenceNet anonymization pipeline from pretrained parts."""
    face_model_id = "hkung/face-anon-simple"
    clip_model_id = "openai/clip-vit-large-patch14"
    sd_model_id = "stabilityai/stable-diffusion-2-1"

    # The denoising UNet and the two ReferenceNets (identity reference and
    # conditioning reference) come from the face-anon-simple checkpoint.
    unet = UNet2DConditionModel.from_pretrained(
        face_model_id, subfolder="unet", use_safetensors=True
    )
    referencenet = ReferenceNetModel.from_pretrained(
        face_model_id, subfolder="referencenet", use_safetensors=True
    )
    conditioning_referencenet = ReferenceNetModel.from_pretrained(
        face_model_id, subfolder="conditioning_referencenet", use_safetensors=True
    )

    # The VAE and noise scheduler are reused from Stable Diffusion 2.1.
    vae = AutoencoderKL.from_pretrained(
        sd_model_id, subfolder="vae", use_safetensors=True
    )
    # Schedulers and image processors load config only, not weights, so
    # `use_safetensors` does not apply to them.
    scheduler = DDPMScheduler.from_pretrained(sd_model_id, subfolder="scheduler")
    feature_extractor = CLIPImageProcessor.from_pretrained(clip_model_id)
    image_encoder = CLIPVisionModel.from_pretrained(
        clip_model_id, use_safetensors=True
    )

    pipe = StableDiffusionReferenceNetPipeline(
        unet=unet,
        referencenet=referencenet,
        conditioning_referencenet=conditioning_referencenet,
        vae=vae,
        feature_extractor=feature_extractor,
        image_encoder=image_encoder,
        scheduler=scheduler,
    )
    return pipe.to(DEVICE)


# Load the models once at startup so each request only pays for inference.
pipe = load_pipeline()

# torch.manual_seed seeds the global CPU RNG and returns that Generator;
# passing it to the pipeline makes sampling reproducible across runs.
generator = torch.manual_seed(1)

# SFD face detector + 2D landmarks, used to locate and align faces.
fa = face_alignment.FaceAlignment(
    face_alignment.LandmarksType.TWO_D,
    face_detector="sfd",
    device=DEVICE,
)


def anonymize(
    image: np.ndarray,
    anonymization_degree: float = 1.25,
    num_inference_steps: int = 25,
    guidance_scale: float = 4.0,
):
    """Gradio callback: take an RGB numpy image, return the anonymized PIL image."""
    if image is None:
        return None
    pil_image = Image.fromarray(image)
    return anonymize_faces_in_image(
        image=pil_image,
        face_alignment=fa,
        pipe=pipe,
        generator=generator,
        face_image_size=512,  # working resolution for each detected face
        num_inference_steps=int(num_inference_steps),
        guidance_scale=float(guidance_scale),
        anonymization_degree=float(anonymization_degree),
    )


demo = gr.Interface(
    fn=anonymize,
    inputs=[
        gr.Image(type="numpy", label="Input image"),
        gr.Slider(
            minimum=0.5,
            maximum=2.0,
            step=0.05,
            value=1.25,
            label="Anonymization strength",
        ),
        gr.Slider(
            minimum=10,
            maximum=50,
            step=1,
            value=25,
            label="Diffusion steps (speed vs. quality)",
        ),
        gr.Slider(
            minimum=1.0,
            maximum=10.0,
            step=0.1,
            value=4.0,
            label="Guidance scale",
        ),
    ],
    outputs=gr.Image(type="pil", label="Anonymized image"),
    title="Face Anonymization Made Simple",
    description=(
        "Upload a photo and anonymize all faces using the WACV 2025 "
        '"Face Anonymization Made Simple" model.'
    ),
)

if __name__ == "__main__":
    demo.launch()
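
# ---------------------------------------------------------------------------
# Programmatic usage sketch (not executed by the app): the same helper can be
# called directly without Gradio, e.g. from a batch script. This assumes the
# `anonymize_faces_in_image` signature used in the callback above; the file
# paths "input.jpg" and "anonymized.jpg" are hypothetical examples.
#
#   from PIL import Image
#   img = Image.open("input.jpg").convert("RGB")
#   anon = anonymize_faces_in_image(
#       image=img,
#       face_alignment=fa,
#       pipe=pipe,
#       generator=generator,
#       face_image_size=512,
#       num_inference_steps=25,
#       guidance_scale=4.0,
#       anonymization_degree=1.25,
#   )
#   anon.save("anonymized.jpg")
# ---------------------------------------------------------------------------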