Qwen-Image-2509-Full

Paused

File size: 9,440 Bytes

import gradio as gr
import numpy as np
import random
import torch
import spaces
from PIL import Image
from diffusers import FlowMatchEulerDiscreteScheduler
from optimization import optimize_pipeline_
from qwenimage.pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline
from qwenimage.transformer_qwenimage import QwenImageTransformer2DModel
from qwenimage.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3
import math
import os
import tempfile
from huggingface_hub import hf_hub_download

# --- Model & Repo ---
HF_MODEL = os.environ.get("HF_UPLOAD_REPO", "rahul7star/qwen-edit-img-repo")
dtype = torch.bfloat16
device = "cuda" if torch.cuda.is_available() else "cpu"

# --- Camera prompts ---
BASE_PROMPTS = {
    "front": "Move the camera to a front-facing position showing the full character. Background is plain white.",
    "back": "Move the camera to a back-facing position showing the full character. Background is plain white.",
    "left": "Move the camera to a side (left) profile view. Background is plain white.",
    "right": "Move the camera to a side (right) profile view. Background is plain white.",
    "45_left": "Rotate camera 45° left showing the full character",
    "45_right": "Rotate camera 45° right showing the full character",
    #"90_left": "Rotate camera 90° left",
    #"90_right": "Rotate camera 90° right",
    "top_down": "Switch to top-down view showing the full character",
    "low_angle": "Switch to low-angle view",
    "close_up": "Switch to close-up lens",
    "medium_close_up": "Switch to medium close-up lens",
    "zoom_out": "Switch to zoom out lens",
}

# --- Resolution presets ---
RESOLUTIONS = {
    "1:4": (512, 2048),
    "1:3": (576, 1728),
    "nealy 9:16": (768, 1344),
    "nealy 2:3": (832, 1216),
    "3:4": (896, 1152),
}

MAX_SEED = np.iinfo(np.int32).max

# --- CPU-only upload function ---
def upload_image_and_prompt_cpu(input_image, prompt_text) -> str:
    from datetime import datetime
    import uuid, shutil
    from huggingface_hub import HfApi

    api = HfApi()
    print(prompt_text)
    today_str = datetime.now().strftime("%Y-%m-%d")
    unique_subfolder = f"Upload-Image-{uuid.uuid4().hex[:8]}"
    hf_folder = f"{today_str}/{unique_subfolder}"

    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_img:
        if isinstance(input_image, str):
            shutil.copy(input_image, tmp_img.name)
        else:
            input_image.save(tmp_img.name, format="PNG")
        tmp_img_path = tmp_img.name

    api.upload_file(
        path_or_fileobj=tmp_img_path,
        path_in_repo=f"{hf_folder}/input_image.png",
        repo_id=HF_MODEL,
        repo_type="model",
        token=os.environ.get("HUGGINGFACE_HUB_TOKEN")
    )

    summary_file = tempfile.NamedTemporaryFile(delete=False, suffix=".txt").name
    with open(summary_file, "w", encoding="utf-8") as f:
        f.write(prompt_text)

    api.upload_file(
        path_or_fileobj=summary_file,
        path_in_repo=f"{hf_folder}/summary.txt",
        repo_id=HF_MODEL,
        repo_type="model",
        token=os.environ.get("HUGGINGFACE_HUB_TOKEN")
    )

    os.remove(tmp_img_path)
    os.remove(summary_file)
    return hf_folder

# --- Scheduler & model load ---
scheduler_config = {
    "base_image_seq_len": 256,
    "base_shift": math.log(3),
    "invert_sigmas": False,
    "max_image_seq_len": 8192,
    "max_shift": math.log(3),
    "num_train_timesteps": 1000,
    "shift": 1.0,
    "shift_terminal": None,
    "stochastic_sampling": False,
    "time_shift_type": "exponential",
    "use_beta_sigmas": False,
    "use_dynamic_shifting": True,
    "use_exponential_sigmas": False,
    "use_karras_sigmas": False,
}
scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)

pipe = QwenImageEditPlusPipeline.from_pretrained(
    "Qwen/Qwen-Image-Edit-2509",
    scheduler=scheduler,
    torch_dtype=dtype
).to(device)

# Load LoRA weights
pipe.load_lora_weights(
    "2vXpSwA7/iroiro-lora",
    weight_name="qwen_lora/Qwen-Image-Edit-2509-Lightning-4steps-V1.0-bf16_dim1.safetensors"
)
pipe.fuse_lora(lora_scale=1.0)

# pipe.load_lora_weights(
#     "rahul7star/qwen-char-lora",
#     weight_name="qwen_lora/qwen-multiple-char.safetensors",
# )
# pipe.fuse_lora(lora_scale=1.0)
pipe.unload_lora_weights()
pipe.transformer.__class__ = QwenImageTransformer2DModel
pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
optimize_pipeline_(pipe, image=[Image.new("RGB", (1024, 1024)), Image.new("RGB", (1024, 1024))], prompt="prompt")

# --- Helpers ---
def _append_prompt(base: str, extra: str) -> str:
    extra = (extra or "").strip()
    return (base if not extra else f"{base} {extra}").strip()

def generate_single_view(input_images, prompt, seed, num_inference_steps, true_guidance_scale):
    generator = torch.Generator(device=device).manual_seed(seed)
    result = pipe(
        image=input_images if input_images else None,
        prompt=prompt,
        negative_prompt=" ",
        num_inference_steps=num_inference_steps,
        generator=generator,
        true_cfg_scale=true_guidance_scale,
        num_images_per_prompt=1,
    ).images
    
    return result[0]

def resize_to_preset(img: Image.Image, preset_key: str) -> Image.Image:
    w, h = RESOLUTIONS[preset_key]
    return img.resize((w, h), Image.LANCZOS)

def concat_images_horizontally(images, bg_color=(255, 255, 255)):
    images = [img.convert("RGB") for img in images if img is not None]
    if not images:
        return None
    h = max(img.height for img in images)
    resized = []
    for img in images:
        if img.height != h:
            w = int(img.width * (h / img.height))
            img = img.resize((w, h), Image.LANCZOS)
        resized.append(img)
    w_total = sum(img.width for img in resized)
    canvas = Image.new("RGB", (w_total, h), bg_color)
    x = 0
    for img in resized:
        canvas.paste(img, (x, 0))
        x += img.width
    return canvas

# --- Generate all camera angles dynamically ---
@spaces.GPU()
def generate_turnaround(
    image,
    extra_prompt="",
    preset_key="nealy 9:16",
    seed=42,
    randomize_seed=False,
    true_guidance_scale=1.0,
    num_inference_steps=4,
    progress=gr.Progress(track_tqdm=True),
):
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    if image is None:
        return [None]*(len(BASE_PROMPTS)+1), seed, "❌ 入力画像をアップロードしてください"

    input_image = image.convert("RGB") if isinstance(image, Image.Image) else Image.open(image).convert("RGB")
    pil_images = [input_image]
    
    results = {}
    total = len(BASE_PROMPTS)
    for i, (key, base_prompt) in enumerate(BASE_PROMPTS.items(), start=1):
        progress(i/total, desc=f"{key} 生成中...")
        prompt_full = _append_prompt(base_prompt, extra_prompt)
        img = generate_single_view(pil_images, prompt_full, seed+i, num_inference_steps, true_guidance_scale)
        results[key] = resize_to_preset(img, preset_key)

    concat = concat_images_horizontally(list(results.values()))
    return [*results.values(), concat, seed, f"✅ {len(results)}視点の画像＋連結画像を生成しました"]

# --- UI ---
css = """
#col-container {margin: 0 auto; max-width: 1400px;}
.image-container img {object-fit: contain !important; max-width: 100%; max-height: 100%;}
.notice {background: #fff5f5; border: 1px solid #fca5a5; color: #7f1d1d; padding: 12px 14px; border-radius: 10px; font-weight: 600; line-height: 1.5; margin-bottom: 10px;}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        input_image = gr.Image(label="入力画像", type="pil", height=500)
        extra_prompt = gr.Textbox(
            label="追加プロンプト（各視点プロンプト末尾に追加）",
            placeholder="high detail, anime style, soft lighting, 4k",
            lines=2
        )
        preset_dropdown = gr.Dropdown(
            label="出力解像度プリセット",
            choices=list(RESOLUTIONS.keys()),
            value="nealy 9:16"
        )
        run_button = gr.Button("🎨 生成開始", variant="primary")
        status_text = gr.Textbox(label="ステータス", interactive=False)

        # Dynamic outputs for all angles
        result_images = []
        for key in BASE_PROMPTS.keys():
            result_images.append(gr.Image(label=key.capitalize(), type="pil", format="png", height=400, show_download_button=True))
        result_concat = gr.Image(label="連結画像（全視点）", type="pil", format="png", height=400, show_download_button=True)

        with gr.Accordion("⚙️ 詳細設定", open=False):
            seed_slider = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
            randomize_seed_checkbox = gr.Checkbox(label="ランダムシード", value=True)
            guidance_scale_slider = gr.Slider(label="True guidance scale", minimum=1.0, maximum=10.0, step=0.1, value=1.0)
            num_steps_slider = gr.Slider(label="生成ステップ数", minimum=1, maximum=40, step=1, value=4)

    run_button.click(
        fn=generate_turnaround,
        inputs=[input_image, extra_prompt, preset_dropdown, seed_slider, randomize_seed_checkbox, guidance_scale_slider, num_steps_slider],
        outputs=[*result_images, result_concat, seed_slider, status_text]
    )

if __name__ == "__main__":
    demo.launch()