import torch
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
from PIL import Image
import gradio as gr

# 1. Hugging Face checkpoint served by this app
MODEL_ID = "Salesforce/instructblip-vicuna-7b"

# 2. Pick the runtime (half precision on GPU, full precision on CPU),
#    then load the processor and the model
if torch.cuda.is_available():
    device, dtype = "cuda", torch.float16
else:
    device, dtype = "cpu", torch.float32

processor = InstructBlipProcessor.from_pretrained(MODEL_ID)

# Load the weights in the selected dtype and move the model to the device.
model = InstructBlipForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=dtype
).to(device)

# 3. Define the function that Gradio will call
def image_qa_pipeline(image, question) -> str:
    """Answer a question about an image with the loaded InstructBLIP model.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.
            It is handed to the processor unchanged.
        question: The user's question. When it is missing or contains only
            whitespace, a generic "describe the image" prompt is used.

    Returns:
        The generated answer with surrounding whitespace removed, or a
        message asking for an image when ``image`` is ``None``.
    """
    if image is None:
        return "Please upload an image."

    # Trim the prompt so stray whitespace from the textbox is not fed to the
    # model; fall back to a generic instruction when nothing usable remains.
    prompt = (question or "").strip() or "Describe this image in detail."

    # Preprocess image + prompt into tensors on the model's device
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)

    # Generate without tracking gradients (inference only)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=64,
            num_beams=3,
        )

    answer = processor.tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Decoded text can carry leading/trailing whitespace; don't show it in the UI.
    return answer.strip()

# 4. Build the Gradio interface: an image upload and a question box feed
#    image_qa_pipeline, whose return value is shown in a text box
demo = gr.Interface(
    title="Multimodal Image Q&A Assistant",
    description="Upload an image and ask a question about it.",
    fn=image_qa_pipeline,
    inputs=[
        # type="pil" asks Gradio to hand the upload over as a PIL image
        gr.Image(type="pil", label="Upload an image"),
        gr.Textbox(label="Question about the image"),
    ],
    outputs=gr.Textbox(label="Answer"),
)

# 5. Launch the app only when this file is run as a script
if __name__ == "__main__":
    demo.launch()