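"""Gradio demo for the WACV 2025 paper "Face Anonymization Made Simple".

Loads the ReferenceNet anonymization pipeline once at startup and serves a
web UI that anonymizes every face detected in an uploaded photo. Launch by
running this file directly.
"""
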
import torch
import gradio as gr

from PIL import Image
import numpy as np

from transformers import CLIPImageProcessor, CLIPVisionModel
from diffusers import AutoencoderKL, DDPMScheduler
from src.diffusers.models.referencenet.referencenet_unet_2d_condition import (
    ReferenceNetModel,
)
from src.diffusers.models.referencenet.unet_2d_condition import UNet2DConditionModel
from src.diffusers.pipelines.referencenet.pipeline_referencenet import (
    StableDiffusionReferenceNetPipeline,
)
from utils.anonymize_faces_in_image import anonymize_faces_in_image
import face_alignment


# Prefer the GPU; diffusion inference also runs on CPU, just much slower.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def load_pipeline():
    """Assemble the anonymization pipeline from its pretrained components.

    The UNet and both ReferenceNets come from the face-anon-simple
    checkpoint; the VAE and DDPM scheduler are reused from Stable Diffusion
    2.1, and CLIP ViT-L/14 provides the image conditioning.
    """
    face_model_id = "hkung/face-anon-simple"
    clip_model_id = "openai/clip-vit-large-patch14"
    sd_model_id = "stabilityai/stable-diffusion-2-1"

    unet = UNet2DConditionModel.from_pretrained(
        face_model_id, subfolder="unet", use_safetensors=True
    )
    referencenet = ReferenceNetModel.from_pretrained(
        face_model_id, subfolder="referencenet", use_safetensors=True
    )
    conditioning_referencenet = ReferenceNetModel.from_pretrained(
        face_model_id, subfolder="conditioning_referencenet", use_safetensors=True
    )
    vae = AutoencoderKL.from_pretrained(
        sd_model_id, subfolder="vae", use_safetensors=True
    )
    # Schedulers and image processors only read config files, so there is no
    # weight format to choose for them.
    scheduler = DDPMScheduler.from_pretrained(sd_model_id, subfolder="scheduler")
    feature_extractor = CLIPImageProcessor.from_pretrained(clip_model_id)
    image_encoder = CLIPVisionModel.from_pretrained(
        clip_model_id, use_safetensors=True
    )

    pipe = StableDiffusionReferenceNetPipeline(
        unet=unet,
        referencenet=referencenet,
        conditioning_referencenet=conditioning_referencenet,
        vae=vae,
        feature_extractor=feature_extractor,
        image_encoder=image_encoder,
        scheduler=scheduler,
    )

    pipe = pipe.to(DEVICE)
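    # Optional tweak, not part of the original app: half precision on CUDA
    # roughly halves VRAM use. Left commented out as a hedged sketch.
    # if DEVICE == "cuda":
    #     pipe = pipe.to(dtype=torch.float16)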
    return pipe


# Load the heavy models once at startup so each request only pays for
# inference (better UX and less wasted compute).
pipe = load_pipeline()
# Fixed seed so repeated runs on the same photo yield the same result.
generator = torch.manual_seed(1)

# SFD face detector with 2D landmarks, used to locate and align faces
# before they are passed to the diffusion pipeline.
fa = face_alignment.FaceAlignment(
    face_alignment.LandmarksType.TWO_D,
    face_detector="sfd",
    device=DEVICE,
)


def anonymize(
    image: np.ndarray,
    anonymization_degree: float = 1.25,
    num_inference_steps: int = 25,
    guidance_scale: float = 4.0,
):
    """
    Gradio callback: takes an RGB numpy image and returns anonymized PIL image.
    """

    if image is None:
        return None

    pil_image = Image.fromarray(image)

    anon_image = anonymize_faces_in_image(
        image=pil_image,
        face_alignment=fa,
        pipe=pipe,
        generator=generator,
        face_image_size=512,
        num_inference_steps=int(num_inference_steps),
        guidance_scale=float(guidance_scale),
        anonymization_degree=float(anonymization_degree),
    )

    return anon_image


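# A minimal scripted use of the same callback outside the UI; the image
# paths are hypothetical:
#
#     img = np.asarray(Image.open("group_photo.jpg").convert("RGB"))
#     out = anonymize(img, anonymization_degree=1.25)
#     out.save("group_photo_anon.png")
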
demo = gr.Interface(
    fn=anonymize,
    inputs=[
        gr.Image(type="numpy", label="Input image"),
        gr.Slider(
            minimum=0.5,
            maximum=2.0,
            step=0.05,
            value=1.25,
            label="Anonymization strength",
        ),
        gr.Slider(
            minimum=10,
            maximum=50,
            step=1,
            value=25,
            label="Diffusion steps (speed vs quality)",
        ),
        gr.Slider(
            minimum=1.0,
            maximum=10.0,
            step=0.1,
            value=4.0,
            label="Guidance scale",
        ),
    ],
    outputs=gr.Image(type="pil", label="Anonymized image"),
    title="Face Anonymization Made Simple",
    description=(
        "Upload a photo and anonymize all faces using the WACV 2025 "
        "\"Face Anonymization Made Simple\" model."
    ),
)

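# On busy shared hardware, requests can be serialized through Gradio's
# built-in queue (a standard option, not used in the original app):
# demo.queue(max_size=8)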

if __name__ == "__main__":
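    # launch() serves on http://127.0.0.1:7860 by default; share=True (a
    # standard Gradio option) would also create a temporary public URL.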
    demo.launch()