"""Gradio demo: ask free-form questions about an uploaded image with InstructBLIP."""

import torch
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
from PIL import Image
import gradio as gr

# 1. Hugging Face model ID
MODEL_ID = "Salesforce/instructblip-vicuna-7b"

# Prompt used when the user leaves the question box empty or whitespace-only.
DEFAULT_QUESTION = "Describe this image in detail."

# Generation settings passed to ``model.generate``.
MAX_NEW_TOKENS = 64
NUM_BEAMS = 3

# 2. Load processor and model
processor = InstructBlipProcessor.from_pretrained(MODEL_ID)

# Use half precision only on GPU; CPU runs in float32.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

model = InstructBlipForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=dtype,
)
model = model.to(device)


# 3. Define the function that Gradio will call
def image_qa_pipeline(image: "Image.Image | None", question: "str | None") -> str:
    """Answer ``question`` about ``image`` using the loaded InstructBLIP model.

    Args:
        image: The uploaded picture as a PIL image (the interface below is
            configured with ``type="pil"``), or ``None`` if nothing was
            uploaded.
        question: The user's question. When it is ``None``, empty, or only
            whitespace, ``DEFAULT_QUESTION`` is used instead.

    Returns:
        The decoded model output with surrounding whitespace removed, or a
        short prompt asking for an image when ``image`` is ``None``.
    """
    if image is None:
        return "Please upload an image."

    if not (question and question.strip()):
        question = DEFAULT_QUESTION

    # Preprocess. Passing ``dtype`` as well as ``device`` casts the
    # floating-point tensors (the pixel values) to the model's precision so a
    # float16 model is not fed float32 inputs; integer token ids are only
    # moved to the device, not cast.
    inputs = processor(images=image, text=question, return_tensors="pt").to(
        device, dtype
    )

    # Generate
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            num_beams=NUM_BEAMS,
        )

    # Only the first (and only) sequence in the batch is decoded.
    answer = processor.tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return answer.strip()


# 4. Build the Gradio interface
demo = gr.Interface(
    fn=image_qa_pipeline,
    inputs=[
        gr.Image(type="pil", label="Upload an image"),
        gr.Textbox(label="Question about the image"),
    ],
    outputs=gr.Textbox(label="Answer"),
    title="Multimodal Image Q&A Assistant",
    description="Upload an image and ask a question about it.",
)

# 5. Launch
if __name__ == "__main__":
    demo.launch()