import base64
import os
from io import BytesIO
from typing import Any, Dict, List

from PIL import Image
from ultralytics import YOLO


class EndpointHandler:
    """Callable request handler that runs a YOLO model on a single image.

    The model weights are loaded once at construction time. Each call takes a
    request dictionary, runs prediction, and returns the detected boxes as
    plain JSON-serializable dictionaries.
    """

    # File name of the weights, looked up inside the directory given to __init__.
    _WEIGHTS_FILENAME = "FFDNet-L.pt"

    # Only full scheme prefixes count as URLs. ":" is not part of the base64
    # alphabet, so a base64 payload can never match these.
    _URL_PREFIXES = ("http://", "https://")

    def __init__(self, path=""):
        """Load the model weights from the directory *path*.

        Args:
            path: Directory containing ``FFDNet-L.pt``. An empty string means
                the current working directory.
        """
        # os.path.join keeps an empty *path* relative; plain string formatting
        # would produce "/FFDNet-L.pt", i.e. a file at the filesystem root.
        self.model = YOLO(os.path.join(path, self._WEIGHTS_FILENAME))
        # Maps the model's integer class ids to the names reported to callers.
        self.id_to_cls = {0: "TextBox", 1: "ChoiceButton", 2: "Signature"}

    @staticmethod
    def _load_image(inputs: Any) -> Any:
        """Turn the request's ``inputs`` value into something ``predict`` accepts.

        - ``http://`` / ``https://`` strings are returned unchanged; fetching
          is left to the model's ``predict`` call.
        - Any other string is treated as base64 image data, optionally wrapped
          in a ``data:<mime>;base64,`` URI header, and decoded to a PIL image.
        - Non-string values are returned unchanged.

        Raises:
            binascii.Error: If the base64 payload is incorrectly padded.
            PIL.UnidentifiedImageError: If the decoded bytes are not an image.
        """
        if not isinstance(inputs, str):
            return inputs
        if inputs.startswith(EndpointHandler._URL_PREFIXES):
            return inputs

        payload = inputs
        if payload.startswith("data:"):
            # b64decode silently drops non-alphabet characters, so the letters
            # of an un-stripped data-URI header would corrupt the decoded bytes.
            payload = payload.partition(",")[2]
        return Image.open(BytesIO(base64.b64decode(payload)))

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Run detection on the image described by *data*.

        Args:
            data: A dictionary containing:
                - "inputs": base64 encoded image (raw or as a data URI) or an
                  http(s) image URL. If the key is absent, *data* itself is
                  used as the input.
                - "parameters": optional dict with any of "conf" (default
                  0.3), "iou" (default 0.1), "imgsz" (default 1600) and
                  "augment" (default True), forwarded to the model's
                  ``predict`` call.

        Returns:
            One dictionary per detected box with the keys "widget_type"
            (class name from ``id_to_cls``), "confidence", and
            "bounding_box". The bounding box holds "cx", "cy", "w", "h" read
            from the boxes' ``xywhn`` attribute (normalized center/size
            coordinates according to the Ultralytics documentation).

        Raises:
            KeyError: If the model reports a class id missing from
                ``id_to_cls``.
        """
        # Read without popping so the caller's request dict is left untouched.
        inputs = data.get("inputs", data)
        # An explicit null for "parameters" is treated like a missing key.
        parameters = data.get("parameters") or {}

        results = self.model.predict(
            self._load_image(inputs),
            conf=parameters.get("conf", 0.3),
            iou=parameters.get("iou", 0.1),
            imgsz=parameters.get("imgsz", 1600),
            augment=parameters.get("augment", True),
        )

        predictions: List[Dict[str, Any]] = []
        for result in results:
            if result.boxes is None:
                continue
            # Convert once per result and walk the per-box arrays in lockstep
            # instead of indexing the Boxes object for every detection.
            boxes = result.boxes.cpu().numpy()
            for (cx, cy, w, h), cls_id, score in zip(
                boxes.xywhn, boxes.cls, boxes.conf
            ):
                predictions.append(
                    {
                        "widget_type": self.id_to_cls[int(cls_id)],
                        "confidence": float(score),
                        "bounding_box": {
                            "cx": float(cx),
                            "cy": float(cy),
                            "w": float(w),
                            "h": float(h),
                        },
                    }
                )
        return predictions