import logging

import requests
from bs4 import BeautifulSoup
from readability import Document

logger = logging.getLogger(__name__)

# Browser-like User-Agent sent with every request.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/91.0.4472.124 Safari/537.36'
)

# Default number of seconds passed to ``requests.get(timeout=...)``.
_DEFAULT_TIMEOUT = 10

_HTTP_SCHEMES = ('http://', 'https://')


def _normalize_url(url: str) -> str:
    """Return *url* stripped of surrounding whitespace, with a scheme.

    ``https://`` is prepended when the URL does not already start with
    ``http://`` or ``https://`` (compared case-insensitively).

    Raises:
        ValueError: If the URL is empty after stripping.
    """
    url = url.strip()
    if not url:
        raise ValueError("URL is empty")
    # Compare in lower case so 'HTTP://host' is not turned into
    # 'https://HTTP://host'.
    if not url.lower().startswith(_HTTP_SCHEMES):
        url = 'https://' + url
    return url


def _html_to_text(html: str) -> str:
    """Convert an HTML fragment to plain text, one non-blank line per row."""
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.get_text(separator='\n\n')
    # Strip every line and drop the ones that end up empty.
    return "\n".join(filter(None, map(str.strip, text.splitlines())))


class URLParser:
    """Extracts main content from URLs."""

    @staticmethod
    def parse(url: str, timeout: float = _DEFAULT_TIMEOUT) -> str:
        """Extract the main text content from a URL.

        The page is fetched with ``requests``, reduced to its main article
        with ``readability.Document`` and flattened to plain text with
        BeautifulSoup.

        Args:
            url: Address to fetch. Surrounding whitespace is ignored and
                ``https://`` is prepended when no http(s) scheme is present.
            timeout: Seconds passed as the ``timeout`` argument of
                ``requests.get``. Defaults to 10.

        Returns:
            A string of the form ``"Title: <title>\\n\\n<text>"`` where
            ``<text>`` has one stripped, non-empty line per row.

        Raises:
            RuntimeError: If the URL is empty, the request fails, the
                response has an error status, or extraction fails. The
                original exception is attached as ``__cause__``.
        """
        try:
            url = _normalize_url(url)
            logger.info("Fetching URL: %s", url)

            response = requests.get(
                url,
                headers={'User-Agent': _USER_AGENT},
                timeout=timeout,
            )
            response.raise_for_status()

            # NOTE(review): per the requests docs, ``response.text`` falls
            # back to ISO-8859-1 for text/* responses without a charset;
            # consider ``response.apparent_encoding`` if mojibake shows up.
            # Use readability to extract the main article content.
            doc = Document(response.text)
            summary_html = doc.summary()
            title = doc.title()

            clean_text = _html_to_text(summary_html)
            full_content = f"Title: {title}\n\n{clean_text}"

            logger.info(
                "Extracted %d characters from URL", len(full_content)
            )
            return full_content
        except Exception as e:
            # Boundary handler: callers only need to catch RuntimeError.
            logger.error("Error parsing URL: %s", e)
            raise RuntimeError(f"Failed to parse URL: {e}") from e