|
|
import torch |
|
|
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration |
|
|
from PIL import Image |
|
|
import gradio as gr |
|
|
|
|
|
|
|
|
# Hugging Face Hub checkpoint shared by the processor and the model.
MODEL_ID = "Salesforce/instructblip-vicuna-7b"

# Processor for MODEL_ID; the pipeline uses it to encode the image/question
# pair and to decode the generated token ids.
processor = InstructBlipProcessor.from_pretrained(MODEL_ID)

# Use the GPU in half precision when CUDA is available; otherwise stay on the
# CPU in full precision.
if torch.cuda.is_available():
    device = "cuda"
    dtype = torch.float16
else:
    device = "cpu"
    dtype = torch.float32

# Load the weights in the selected precision, then place them on the selected
# device.
model = InstructBlipForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=dtype,
).to(device)
|
|
|
|
|
|
|
|
def image_qa_pipeline(image, question):
    """Answer a question about an image with the module-level InstructBLIP model.

    Args:
        image: PIL image supplied by the Gradio image input, or ``None`` when
            nothing was uploaded.
        question: Free-text question about the image. ``None``, empty, or
            whitespace-only input falls back to a generic description prompt.

    Returns:
        The decoded answer with surrounding whitespace removed, or a short
        instruction string when no image was provided.
    """
    if image is None:
        return "Please upload an image."

    # Normalise the prompt once so stray whitespace never reaches the model,
    # and fall back to a generic description prompt when nothing is left.
    question = (question or "").strip()
    if not question:
        question = "Describe this image in detail."

    # Passing the dtype alongside the device casts the floating-point tensors
    # (the pixel values) to the precision the model was loaded in, so they
    # match a float16 model on CUDA; integer token ids are only moved.
    inputs = processor(
        images=image,
        text=question,
        return_tensors="pt",
    ).to(device, dtype)

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=64,
            num_beams=3,
        )

    # Only the first (and only) sequence in the batch is decoded.
    answer = processor.tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return answer.strip()
|
|
|
|
|
|
|
|
# Input widgets, listed in the positional order of
# image_qa_pipeline(image, question).
_image_input = gr.Image(type="pil", label="Upload an image")
_question_input = gr.Textbox(label="Question about the image")

# Web UI that passes the uploaded image and the typed question to the
# pipeline and shows the returned string in a text box.
demo = gr.Interface(
    fn=image_qa_pipeline,
    inputs=[_image_input, _question_input],
    outputs=gr.Textbox(label="Answer"),
    title="Multimodal Image Q&A Assistant",
    description="Upload an image and ask a question about it.",
)

# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
|
|
|
|
|