# Test1 / app.py
# npro65's picture
# Update app.py
# 31bf61a verified
import torch
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
from PIL import Image
import gradio as gr
# 1. Checkpoint identifier on the Hugging Face Hub
MODEL_ID = "Salesforce/instructblip-vicuna-7b"

# 2. Choose the runtime target, then load the processor and the model.
# float16 is selected only when a CUDA device is available; otherwise the
# model is loaded in float32 and stays on the CPU.
if torch.cuda.is_available():
    device = "cuda"
    dtype = torch.float16
else:
    device = "cpu"
    dtype = torch.float32

processor = InstructBlipProcessor.from_pretrained(MODEL_ID)
model = InstructBlipForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=dtype
).to(device)
# 3. Define the function that Gradio will call
def image_qa_pipeline(image, question):
    """Answer a free-form question about an uploaded image.

    Args:
        image: Image object handed over by the Gradio image input, or
            ``None`` when nothing was uploaded.
        question: Text from the question box. ``None``, empty, or
            whitespace-only input falls back to a generic description prompt.

    Returns:
        The decoded model output with surrounding whitespace removed, or a
        short hint asking for an image when ``image`` is ``None``.
    """
    if image is None:
        return "Please upload an image."

    # Normalise once so the emptiness check and the prompt sent to the model
    # agree; stray leading/trailing whitespace is not part of the question.
    prompt = (question or "").strip()
    if not prompt:
        prompt = "Describe this image in detail."

    # Preprocess. BatchFeature.to(device, dtype) moves every tensor to the
    # device but casts only floating-point tensors (the pixel values), so the
    # integer token ids are left untouched while the image tensor matches the
    # precision the model was loaded with.
    inputs = processor(images=image, text=prompt, return_tensors="pt")
    inputs = inputs.to(device, model.dtype)

    # Generate
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=64,
            num_beams=3,
        )

    answer = processor.tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return answer.strip()
# 4. Assemble the Gradio UI: an image and a question go in, a text answer comes out
_image_input = gr.Image(type="pil", label="Upload an image")
_question_input = gr.Textbox(label="Question about the image")
_answer_output = gr.Textbox(label="Answer")

demo = gr.Interface(
    fn=image_qa_pipeline,
    inputs=[_image_input, _question_input],
    outputs=_answer_output,
    title="Multimodal Image Q&A Assistant",
    description="Upload an image and ask a question about it.",
)

# 5. Start the app only when this file is executed as a script
if __name__ == "__main__":
    demo.launch()