import base64
import logging
import os
import tempfile
from pathlib import Path
from typing import Union

from .pdf_parser import PDFParser
from .url_parser import URLParser

logger = logging.getLogger(__name__)

# Inputs whose string form is this long or longer are never probed as a
# filesystem path; they are treated as an encoded PDF payload instead.
_MAX_PATH_LENGTH = 256


class InputProcessor:
    """
    Main entry point for processing different types of inputs.
    """

    @staticmethod
    def process(input_type: str, input_data: Union[str, bytes]) -> str:
        """
        Process input based on type.

        Args:
            input_type: 'text', 'pdf', or 'url'
            input_data: The actual text, a PDF file path, a base64-encoded
                PDF (optionally prefixed with a header such as
                "data:application/pdf;base64,"), raw PDF bytes, or a URL

        Returns:
            Extracted text content

        Raises:
            ValueError: If ``input_type`` is not supported, or if a PDF
                payload cannot be decoded, stored, or parsed.
        """
        logger.info("Processing input type: %s", input_type)

        if input_type == 'text':
            # Plain text needs no extraction; hand it back unchanged.
            return input_data
        if input_type == 'pdf':
            return InputProcessor._process_pdf(input_data)
        if input_type == 'url':
            return URLParser.parse(input_data)
        raise ValueError(f"Unsupported input type: {input_type}")

    @staticmethod
    def _process_pdf(input_data: Union[str, bytes]) -> str:
        """Parse a PDF given as a file path, a base64 string, or raw bytes."""
        if InputProcessor._is_existing_file(input_data):
            return PDFParser.parse(input_data)

        tmp_path = None
        try:
            pdf_bytes = InputProcessor._decode_pdf_payload(input_data)
            if not pdf_bytes:
                raise ValueError("empty PDF payload")

            # delete=False so the file survives the `with` block and can be
            # reopened by the parser after it has been closed.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                # Record the name before writing so the cleanup below still
                # runs if the write fails part-way through.
                tmp_path = tmp.name
                tmp.write(pdf_bytes)
            logger.info("Saved base64 PDF to temporary file: %s", tmp_path)

            return PDFParser.parse(tmp_path)
        except Exception as e:
            logger.error("Failed to process PDF input: %s", e)
            raise ValueError(f"Invalid PDF input: {e}") from e
        finally:
            # Best-effort cleanup: a failed delete is logged, never raised.
            if tmp_path is not None:
                try:
                    os.unlink(tmp_path)
                except OSError as cleanup_error:
                    logger.warning(
                        "Failed to delete temp PDF file: %s", cleanup_error
                    )

    @staticmethod
    def _is_existing_file(input_data: Union[str, bytes]) -> bool:
        """Return True when input_data is a short string naming an existing file."""
        try:
            if len(str(input_data)) >= _MAX_PATH_LENGTH:
                return False
            # is_file() rather than exists(): directories, and the empty
            # string (which Path resolves to the current directory), are not
            # PDF files and must not be handed to the parser as a path.
            return Path(input_data).is_file()
        except (TypeError, ValueError, OSError):
            # bytes input, embedded NUL characters, or an OS-level failure
            # while probing: treat the input as "not a path".
            return False

    @staticmethod
    def _decode_pdf_payload(input_data: Union[str, bytes]) -> bytes:
        """Return the PDF bytes carried by input_data (bytes as-is, str via base64)."""
        if isinstance(input_data, bytes):
            return input_data
        # Handle potential header "data:application/pdf;base64,"
        if "," in input_data:
            input_data = input_data.split(",")[1]
        return base64.b64decode(input_data)