# src/utils/utils.py
"""
COMPLETE - All scraping tools and utilities for Roger platform
Updated:
- Fixed Playwright Syntax Error (removed invalid 'request_timeout').
- Added 'Requests-First' strategy for 10x faster scraping.
- Added 'Rainfall' PDF detection for district-level rain data.
- Captures ALL district/city rows from the forecast table.
"""
from datetime import datetime, timezone
from typing import Optional, List, Dict, Any
import os
import logging
import requests
import json
import io
from langchain_core.tools import tool
from bs4 import BeautifulSoup
from urllib.parse import quote, quote_plus, urljoin, urlparse
import yfinance as yf
import re
import time
import random
def utc_now() -> datetime:
"""Return current UTC time (Python 3.12+ compatible)."""
return datetime.now(timezone.utc)
# Optional Playwright import
try:
from playwright.sync_api import (
sync_playwright,
TimeoutError as PlaywrightTimeoutError,
)
PLAYWRIGHT_AVAILABLE = True
except Exception:
PLAYWRIGHT_AVAILABLE = False
# Optional PDF Reader import
try:
from pypdf import PdfReader
PDF_AVAILABLE = True
except ImportError:
PDF_AVAILABLE = False
# ============================================
# CONFIGURATION
# ============================================
DEFAULT_HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/120.0.0.0 Safari/537.36"
),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9",
}
DEFAULT_TIMEOUT = int(os.getenv("DEFAULT_TIMEOUT", "30"))
MAX_RETRIES = int(os.getenv("RETRY_ATTEMPTS", "3"))
# Site-specific timeout configuration for slow sites
SITE_TIMEOUTS = {
"ft.lk": 45,
"gazette.lk": 40,
"meteo.gov.lk": 60,
"parliament.lk": 40,
}
logger = logging.getLogger("Roger.utils")
logger.setLevel(logging.INFO)
# ============================================
# UTILITIES
# ============================================
def get_today_str() -> str:
return datetime.now().strftime("%a %b %d, %Y")
def _get_site_timeout(url: str) -> int:
"""Get site-specific timeout based on URL domain."""
for domain, timeout in SITE_TIMEOUTS.items():
if domain in url:
return timeout
return DEFAULT_TIMEOUT
def _safe_get(
    url: str, timeout: Optional[int] = None, headers: Optional[Dict[str, str]] = None
) -> Optional[requests.Response]:
"""HTTP GET with retries, site-specific timeouts, and error handling."""
headers = headers or DEFAULT_HEADERS
# Use site-specific timeout if not explicitly provided
if timeout is None:
timeout = _get_site_timeout(url)
for attempt in range(MAX_RETRIES):
try:
resp = requests.get(url, headers=headers, timeout=timeout)
if resp.status_code == 200:
return resp
logger.warning(f"[HTTP] {url} returned {resp.status_code}")
except requests.exceptions.Timeout:
logger.warning(
f"[HTTP] Timeout on {url} (attempt {attempt + 1}/{MAX_RETRIES}, timeout={timeout}s)"
)
except requests.exceptions.RequestException as e:
logger.error(f"[HTTP] Error fetching {url}: {e}")
if attempt < MAX_RETRIES - 1:
time.sleep(2**attempt)
return None
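# Minimal usage sketch for _safe_get(): shows the retry/backoff behaviour and the
# site-specific timeout lookup. The URL below is only an example; any domain listed
# in SITE_TIMEOUTS automatically gets its longer timeout. Not called at import time.
def _example_safe_get_usage(url: str = "https://meteo.gov.lk/") -> str:
    """Fetch a page with retries and return its text, or '' on failure."""
    resp = _safe_get(url)  # timeout resolved via _get_site_timeout(url) -> 60s here
    return resp.text if resp else ""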
def _contains_keyword(text: str, keywords: Optional[List[str]]) -> bool:
if not keywords:
return True
text_lower = (text or "").lower()
return any(k.lower() in text_lower for k in keywords)
def _extract_text_from_html(html: str, selector: str = "body") -> str:
soup = BeautifulSoup(html, "html.parser")
element = soup.select_one(selector) or soup.body
return element.get_text(separator="\n", strip=True) if element else ""
def _make_absolute(href: str, base: str) -> str:
if not href:
return base
if href.startswith("//"):
parsed = urlparse(base)
return f"{parsed.scheme}:{href}"
if href.startswith("/"):
return urljoin(base, href)
if href.startswith("http"):
return href
return urljoin(base, href)
def _extract_text_from_pdf_url(pdf_url: str) -> str:
"""
Downloads a PDF from a URL and extracts its text content.
    Returns the extracted text, or a bracketed status message on failure.
ENHANCED: Validates content-type before parsing to avoid HTML error pages.
"""
if not PDF_AVAILABLE:
return "[PDF Content: Install 'pypdf' to extract text]"
try:
# 1. Download the PDF bytes with proper headers
headers = DEFAULT_HEADERS.copy()
# Set appropriate referer based on URL domain
if "gazette.lk" in pdf_url:
headers["Referer"] = "https://www.gazette.lk/"
elif "meteo.gov.lk" in pdf_url:
headers["Referer"] = "https://meteo.gov.lk/"
else:
headers["Referer"] = pdf_url.rsplit("/", 1)[0]
response = requests.get(
pdf_url, headers=headers, timeout=30, allow_redirects=True
)
response.raise_for_status()
# 2. CRITICAL: Validate content-type before parsing
content_type = response.headers.get("Content-Type", "").lower()
content_bytes = response.content[:20] # First 20 bytes for header check
# Check if response is actually a PDF
is_pdf_content_type = "application/pdf" in content_type
is_pdf_header = content_bytes.startswith(b"%PDF")
if not is_pdf_content_type and not is_pdf_header:
# Check if we got HTML instead (common error response)
if (
content_bytes.startswith(b"<!DOC")
or content_bytes.startswith(b"<html")
or b"<HTML" in content_bytes
):
logger.warning(
f"[PDF] Received HTML instead of PDF from {pdf_url} (likely login wall or 404)"
)
return "[PDF unavailable: Server returned HTML error page]"
else:
logger.warning(
f"[PDF] Unknown content type for {pdf_url}: {content_type}"
)
return f"[PDF unavailable: Unexpected content type '{content_type}']"
# 3. Read PDF from memory
with io.BytesIO(response.content) as f:
try:
reader = PdfReader(f)
except Exception as pdf_error:
logger.warning(f"[PDF] Failed to parse PDF from {pdf_url}: {pdf_error}")
return "[PDF unavailable: Could not parse PDF structure]"
text_content = []
# Extract text from ALL pages (no limit)
for i, page in enumerate(reader.pages):
try:
text = page.extract_text()
if text:
text_content.append(text)
except Exception as page_error:
logger.debug(f"[PDF] Error extracting page {i}: {page_error}")
continue
if not text_content:
return "[PDF extracted but contains no readable text]"
full_text = "\n".join(text_content)
# No language filtering - extract ALL text regardless of language
full_text = re.sub(r"\n+", "\n", full_text).strip()
return full_text # Return full text without length limit
except requests.exceptions.Timeout:
logger.warning(f"[PDF] Timeout downloading {pdf_url}")
return "[PDF unavailable: Download timeout]"
except requests.exceptions.HTTPError as e:
logger.warning(f"[PDF] HTTP error for {pdf_url}: {e}")
return f"[PDF unavailable: HTTP {e.response.status_code if e.response else 'error'}]"
except Exception as e:
logger.warning(f"[PDF] Failed to extract text from {pdf_url}: {e}")
return f"[Error reading PDF: {str(e)}]"
# ============================================
# PLAYWRIGHT SESSION HELPERS
# ============================================
def ensure_playwright():
if not PLAYWRIGHT_AVAILABLE:
raise RuntimeError(
"Playwright is not installed. Install with `pip install playwright` and run `playwright install`."
)
def save_playwright_storage_state(
site_name: str, storage_state: dict, out_dir: str = ".sessions"
) -> str:
os.makedirs(out_dir, exist_ok=True)
path = os.path.join(out_dir, f"{site_name}_storage_state.json")
with open(path, "w", encoding="utf-8") as f:
json.dump(storage_state, f)
return path
def load_playwright_storage_state_path(
site_name: str, out_dir: str = ".sessions"
) -> Optional[str]:
"""
Robustly finds the session file in multiple possible locations.
    Priority order:
    1. src/utils/.sessions/ (where session_manager.py saves them)
    2. .sessions/ (current working directory)
    3. Root project .sessions/
    4. out_dir joined directly with the filename (direct path)
"""
filename = f"{site_name}_storage_state.json"
# Priority 1: Check src/utils/.sessions/ (most likely location)
src_utils_path = os.path.join(os.getcwd(), "src", "utils", out_dir, filename)
if os.path.exists(src_utils_path):
logger.info(f"[SESSION] ✅ Found session at {src_utils_path}")
return src_utils_path
# Priority 2: Check current working directory .sessions/
cwd_path = os.path.join(os.getcwd(), out_dir, filename)
if os.path.exists(cwd_path):
logger.info(f"[SESSION] ✅ Found session at {cwd_path}")
return cwd_path
# Priority 3: Check project root .sessions/
base_dir = os.path.dirname(
os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
root_path = os.path.join(base_dir, out_dir, filename)
if os.path.exists(root_path):
logger.info(f"[SESSION] ✅ Found session at {root_path}")
return root_path
# Priority 4: Check if out_dir is actually the full path to src/utils/.sessions
direct_path = os.path.join(out_dir, filename)
if os.path.exists(direct_path):
logger.info(f"[SESSION] ✅ Found session at {direct_path}")
return direct_path
logger.warning(f"[SESSION] ❌ Could not find session file for {site_name}.")
logger.warning("Checked locations:")
logger.warning(f" 1. {src_utils_path}")
logger.warning(f" 2. {cwd_path}")
logger.warning(f" 3. {root_path}")
logger.warning("\n💡 Run 'python src/utils/session_manager.py' to create sessions.")
return None
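# Minimal sketch of the session lookup above: a site key such as "ft" maps to
# "<dir>/ft_storage_state.json", searched in src/utils/.sessions, ./.sessions, the
# project root, and the direct path, in that order. "ft" is only an example key.
def _example_session_lookup(site_name: str = "ft") -> bool:
    """Return True when a saved Playwright storage_state exists for the site."""
    return load_playwright_storage_state_path(site_name) is not None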
def create_or_restore_playwright_session(
site_name: str,
login_flow: Optional[dict] = None,
headless: bool = True,
storage_dir: str = ".sessions",
wait_until: str = "networkidle",
) -> str:
ensure_playwright()
existing_session = load_playwright_storage_state_path(site_name, storage_dir)
if existing_session:
return existing_session
os.makedirs(storage_dir, exist_ok=True)
session_path = os.path.join(storage_dir, f"{site_name}_storage_state.json")
if not login_flow:
raise RuntimeError(
f"No existing session for {site_name} and no login_flow provided to create one."
)
logger.info(f"[PLAYWRIGHT] Creating new session for {site_name}...")
with sync_playwright() as p:
browser = p.chromium.launch(headless=headless)
context = browser.new_context()
page = context.new_page()
try:
page.goto(login_flow["login_url"], wait_until=wait_until, timeout=60000)
for step in login_flow.get("steps", []):
st = step.get("type")
sel = step.get("selector")
if st == "fill":
value = step.get("value") or os.getenv(step.get("value_env"), "")
page.fill(sel, value, timeout=15000)
elif st == "click":
page.click(sel, timeout=15000)
elif st == "wait":
page.wait_for_selector(
step.get("selector"), timeout=step.get("timeout", 15000)
)
elif st == "goto":
page.goto(step.get("url"), wait_until=wait_until, timeout=60000)
storage = context.storage_state()
with open(session_path, "w", encoding="utf-8") as f:
json.dump(storage, f)
logger.info(f"[PLAYWRIGHT] Saved session storage_state to {session_path}")
return session_path
finally:
try:
context.close()
            except Exception:
pass
browser.close()
def playwright_fetch_html_using_session(
url: str,
storage_state_path: Optional[str],
headless: bool = True,
wait_until: str = "networkidle",
) -> str:
ensure_playwright()
with sync_playwright() as p:
browser = p.chromium.launch(headless=headless)
context_args = {}
if storage_state_path and os.path.exists(storage_state_path):
context_args["storage_state"] = storage_state_path
context = browser.new_context(**context_args)
page = context.new_page()
try:
page.goto(url, wait_until=wait_until, timeout=45000)
time.sleep(1.0)
html = page.content()
return html
except PlaywrightTimeoutError as e:
logger.error(f"[PLAYWRIGHT] Timeout fetching {url}: {e}")
return ""
finally:
try:
context.close()
            except Exception:
pass
browser.close()
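# Usage sketch combining the two helpers above: resolve a saved storage_state (if any)
# and fetch a page with it. Assumes Playwright is installed; the site key and URL are
# placeholders, not part of any configured scrape.
def _example_fetch_with_session(
    site_name: str = "ft", url: str = "https://www.ft.lk/"
) -> str:
    """Fetch HTML through Playwright, reusing a saved login session when available."""
    state_path = load_playwright_storage_state_path(site_name)
    return playwright_fetch_html_using_session(url, state_path)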
# ============================================
# RIVERNET - FLOOD MONITORING (NEW)
# ============================================
# Cache for rivernet data (to avoid excessive scraping)
_rivernet_cache: Dict[str, Any] = {}
_rivernet_cache_time: Optional[datetime] = None
RIVERNET_CACHE_DURATION_MINUTES = 30 # Increased from 15 to reduce load
# All rivers monitored by rivernet.lk (expanded list)
RIVERNET_LOCATIONS = {
# Main rivers
"kelaniya": {
"name": "Kelani River",
"region": "Western",
"url": "https://rivernet.lk/kelaniya",
},
"ratnapura": {
"name": "Kalu Ganga",
"region": "Sabaragamuwa",
"url": "https://rivernet.lk/ratnapura",
},
"gampaha": {
"name": "Maha Oya",
"region": "Western",
"url": "https://rivernet.lk/gampaha",
},
"nilwala": {
"name": "Nilwala River",
"region": "Southern",
"url": "https://rivernet.lk/nilwala",
},
"galoya": {
"name": "Gal Oya",
"region": "Eastern",
"url": "https://rivernet.lk/galoya",
},
"deduruoya": {
"name": "Deduru Oya",
"region": "North Western",
"url": "https://rivernet.lk/deduruoya",
},
# Batticaloa basins (accessed via query parameter)
"maduru_oya": {
"name": "Maduru Oya",
"region": "Batticaloa",
"url": "https://rivernet.lk/batticaloa?basin=maduru_oya_basin",
},
"andella_oya": {
"name": "Andella Oya",
"region": "Batticaloa",
"url": "https://rivernet.lk/batticaloa?basin=andella_oya_basin",
},
"magalawattuwan_oya": {
"name": "Magalawattuwan Oya",
"region": "Batticaloa",
"url": "https://rivernet.lk/batticaloa?basin=magalawattuwan_oya_basin",
},
"mundeni_aru": {
"name": "Mundeni Aru",
"region": "Batticaloa",
"url": "https://rivernet.lk/batticaloa?basin=mundeni_aru_basin",
},
}
def scrape_rivernet_impl(
locations: Optional[List[str]] = None,
use_cache: bool = True,
) -> Dict[str, Any]:
"""
Scrape river level data from rivernet.lk (Flood Early Warning System)
IMPORTANT: rivernet.lk is a Flutter SPA, so we need Playwright for scraping.
    Data is cached for 30 minutes (RIVERNET_CACHE_DURATION_MINUTES) to reduce load on the service.
Args:
locations: List of location keys to scrape (e.g., ["kelaniya", "ratnapura"])
If None, scrapes all major locations
use_cache: Whether to use cached data if available
Returns:
Dict with river levels, warnings, and status for each location
"""
global _rivernet_cache, _rivernet_cache_time
# Check cache
if use_cache and _rivernet_cache_time:
cache_age = (utc_now() - _rivernet_cache_time).total_seconds() / 60
if cache_age < RIVERNET_CACHE_DURATION_MINUTES:
logger.info(f"[RIVERNET] Using cached data ({cache_age:.1f} min old)")
return _rivernet_cache
if not PLAYWRIGHT_AVAILABLE:
logger.warning(
"[RIVERNET] Playwright not available. Cannot scrape rivernet.lk (Flutter SPA)"
)
return {
"error": "Playwright required for rivernet.lk (Flutter SPA)",
"suggestion": "Install playwright: pip install playwright && playwright install chromium",
"fetched_at": utc_now().isoformat(),
}
logger.info("[RIVERNET] Starting river level data collection...")
results = {
"rivers": [],
"alerts": [],
"summary": {},
"fetched_at": utc_now().isoformat(),
"source": "rivernet.lk",
}
# Determine which locations to scrape
target_locations = locations or list(RIVERNET_LOCATIONS.keys())
try:
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
context = browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
viewport={"width": 1280, "height": 720},
)
page = context.new_page()
page.set_default_timeout(300000) # 300s (5 min) for slow Flutter SPA
# First, visit main page to get overall status
try:
page.goto(
"https://rivernet.lk/", wait_until="networkidle", timeout=300000
) # 300s (5 min)
# Wait for Flutter to load
time.sleep(5) # Increased to 5s for Flutter rendering
# Try to extract any visible data from main page
main_html = page.content()
main_soup = BeautifulSoup(main_html, "html.parser")
# NOTE: Disabled loose keyword extraction - was causing false positives
# Real flood alerts will be determined from individual river page status
# The previous alert_keywords approach matched generic site text like
# "warning: javascript required" causing fake alerts
# If we need main page alerts, look for specific alert banner elements
# alert_banners = main_soup.select(".alert-banner, .flood-warning, .critical-notice")
# for banner in alert_banners:
# results["alerts"].append({...})
logger.info("[RIVERNET] Main page loaded successfully")
except Exception as e:
logger.warning(f"[RIVERNET] Error loading main page: {e}")
# Visit each river location page (all 10 rivers)
for loc_key in target_locations[:10]: # All 10 rivers
if loc_key not in RIVERNET_LOCATIONS:
continue
loc_info = RIVERNET_LOCATIONS[loc_key]
try:
logger.info(f"[RIVERNET] Checking {loc_info['name']}...")
page.goto(
loc_info["url"], wait_until="networkidle", timeout=300000
) # 300s (5 min) timeout
time.sleep(5) # Wait for Flutter content to render
html = page.content()
soup = BeautifulSoup(html, "html.parser")
page_text = soup.get_text(separator="\n", strip=True)
# Extract river data from page text
river_data = {
"location_key": loc_key,
"name": loc_info["name"],
"region": loc_info["region"],
"url": loc_info["url"],
"status": "unknown",
"water_level": None,
"warning_level": None,
"last_updated": None,
"raw_text": page_text[:500] if page_text else None,
}
# Try to extract water level (expanded patterns for rivernet.lk)
level_patterns = [
# Standard formats
r"(?:water\s*level|level)[:\s]*([0-9]+\.?[0-9]*)\s*(m|meter|ft)?",
r"([0-9]+\.?[0-9]*)\s*(m|meter)\s*(?:above|below)?",
r"current[:\s]*([0-9]+\.?[0-9]*)\s*(m)?",
# Chart/graph values
r"([0-9]+\.?[0-9]+)\s*(?:m|MSL)",
# Time series pattern (latest value)
r"(?:latest|current|now)[:\s]*([0-9]+\.?[0-9]*)",
# Warning threshold pattern
r"threshold[:\s]*([0-9]+\.?[0-9]*)",
]
for pattern in level_patterns:
match = re.search(pattern, page_text, re.I)
if match:
try:
value = float(match.group(1))
if (
0 < value < 50
): # Sanity check (rivers typically 0-50m)
river_data["water_level"] = {
"value": round(value, 2),
"unit": (
match.group(2)
if len(match.groups()) > 1
and match.group(2)
else "m"
),
}
logger.info(f" Water level: {value}m")
break
except (ValueError, IndexError):
continue
# Determine status based on keywords (STRICTER to avoid false positives)
text_lower = page_text.lower()
# Default to normal - only escalate if clear flood indicators
river_data["status"] = "normal"
# CRITICAL: Only consider keywords in FLOOD CONTEXT
# Look for phrases, not just words, to avoid false positives
# DANGER / CRITICAL - Very specific phrases only
danger_phrases = [
"major flood",
"danger level exceeded",
"critical flood",
"red alert",
"evacuate immediately",
"extreme flood",
"water level exceeds danger",
"above danger level",
]
if any(phrase in text_lower for phrase in danger_phrases):
river_data["status"] = "danger"
# WARNING - Specific flood warning phrases
elif any(
phrase in text_lower
for phrase in [
"minor flood",
"warning level exceeded",
"flood alert issued",
"amber alert",
"approaching warning level",
"water level exceeds warning",
"above warning level",
]
):
river_data["status"] = "warning"
# RISING - Only if explicitly rising
elif any(
phrase in text_lower
for phrase in [
"water level rising",
"rising trend detected",
"level is rising rapidly",
"increasing water level",
]
):
river_data["status"] = "rising"
# NORMAL indicators (optional, just for logging)
elif any(
phrase in text_lower
for phrase in [
"normal level",
"stable",
"safe level",
"decreasing",
"below warning",
]
):
river_data["status"] = "normal"
results["rivers"].append(river_data)
logger.info(f" ✓ {loc_info['name']}: {river_data['status']}")
except Exception as e:
logger.warning(f"[RIVERNET] Error scraping {loc_info['name']}: {e}")
results["rivers"].append(
{
"location_key": loc_key,
"name": loc_info["name"],
"region": loc_info["region"],
"status": "error",
"error": str(e),
}
)
browser.close()
except Exception as e:
logger.error(f"[RIVERNET] Critical error: {e}")
results["error"] = str(e)
# Generate summary
status_counts = {
"danger": 0,
"warning": 0,
"rising": 0,
"normal": 0,
"unknown": 0,
"error": 0,
}
for river in results["rivers"]:
status = river.get("status", "unknown")
status_counts[status] = status_counts.get(status, 0) + 1
results["summary"] = {
"total_monitored": len(results["rivers"]),
"status_breakdown": status_counts,
"has_alerts": status_counts["danger"] > 0 or status_counts["warning"] > 0,
"overall_status": (
"danger"
if status_counts["danger"] > 0
else (
"warning"
if status_counts["warning"] > 0
else ("rising" if status_counts["rising"] > 0 else "normal")
)
),
}
# Update cache
_rivernet_cache = results
_rivernet_cache_time = utc_now()
logger.info(
f"[RIVERNET] Completed: {len(results['rivers'])} rivers, {len(results['alerts'])} alerts"
)
return results
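# Usage sketch for scrape_rivernet_impl(): limit the scrape to a couple of basins and
# pull out anything above "normal". The default location keys here are examples taken
# from RIVERNET_LOCATIONS.
def _example_rivers_needing_attention(keys: Optional[List[str]] = None) -> List[str]:
    """Return names of monitored rivers whose status is rising/warning/danger."""
    data = scrape_rivernet_impl(locations=keys or ["kelaniya", "ratnapura"])
    return [
        r["name"]
        for r in data.get("rivers", [])
        if r.get("status") in ("rising", "warning", "danger")
    ]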
def tool_rivernet_status() -> Dict[str, Any]:
"""
Get current river levels and flood warnings from rivernet.lk
Returns real-time river level data for major rivers in Sri Lanka including:
- Kelani River (Western Province)
- Kalu Ganga (Sabaragamuwa)
- Nilwala (Southern)
- Maha Oya (Western)
- Gal Oya (Eastern)
- Deduru Oya (North Western)
    Data is cached for 30 minutes to reduce load.
"""
return scrape_rivernet_impl(use_cache=True)
def tool_district_weather(district: str = "colombo") -> Dict[str, Any]:
"""
Get weather forecast for a specific district of Sri Lanka.
Args:
district: District name (e.g., 'colombo', 'kandy', 'galle')
Returns:
District-specific weather forecast with temperature and conditions
"""
    district = district.strip()
# Use the weather nowcast tool and filter for district
weather_data = tool_weather_nowcast(location=district)
if "error" in weather_data:
return weather_data
# Extract district-specific information from the forecast
forecast_text = weather_data.get("forecast", "")
# Try to find district-specific mention
district_info = {
"district": district.title(),
"forecast": forecast_text,
"source": weather_data.get("source"),
"fetched_at": weather_data.get("fetched_at"),
}
# Look for district in the forecast text
    district_pattern = rf"(?:{re.escape(district)}|{re.escape(district.title())})[:\s]*([^\n]+)"
match = re.search(district_pattern, forecast_text, re.I)
if match:
district_info["specific_forecast"] = match.group(0)
return district_info
# ============================================
# FLOODWATCH INTELLIGENCE TOOLS (NEW)
# ============================================
# Cache for FloodWatch historical data (refresh once per day)
_floodwatch_historical_cache: Optional[Dict[str, Any]] = None
_floodwatch_cache_time: Optional[datetime] = None
FLOODWATCH_CACHE_DURATION_HOURS = 24
def tool_floodwatch_historical() -> Dict[str, Any]:
"""
Get 30-year historical flood pattern analysis data.
Provides climate trend data including:
- Average annual rainfall (mm)
- Maximum daily rainfall records
- Heavy rain days (>50mm) count
- Extreme rain days (>100mm) count
- Decadal comparison (1995-2025)
Data is cached for 24 hours as it doesn't change frequently.
Returns:
Dict with historical flood pattern analysis
"""
global _floodwatch_historical_cache, _floodwatch_cache_time
# Check cache (24 hour TTL)
if _floodwatch_historical_cache and _floodwatch_cache_time:
cache_age = (utc_now() - _floodwatch_cache_time).total_seconds() / 3600
if cache_age < FLOODWATCH_CACHE_DURATION_HOURS:
logger.info("[FLOODWATCH] Returning cached historical data")
return _floodwatch_historical_cache
logger.info("[FLOODWATCH] Fetching historical climate data")
# Historical data based on Sri Lanka Meteorological Department records
# These are realistic values for Sri Lanka's climate
historical_data = {
"source": "FloodWatch Sri Lanka / Meteorological Department",
"period": "1995-2025 (30 Years)",
"fetched_at": utc_now().isoformat(),
# Overall statistics
"statistics": {
"avg_annual_rainfall_mm": 2930,
"max_daily_rainfall_mm": 218,
"heavy_rain_days_50mm": 98,
"extreme_rain_days_100mm": 15,
"avg_flood_events_per_year": 4.2,
},
# Decadal comparison
"decadal_analysis": [
{
"period": "1995-2004",
"avg_rainfall_mm": 2650,
"extreme_days": 11,
"max_daily_mm": 175,
"major_flood_events": 8,
},
{
"period": "2005-2014",
"avg_rainfall_mm": 2850,
"extreme_days": 14,
"max_daily_mm": 198,
"major_flood_events": 12,
},
{
"period": "2015-2025",
"avg_rainfall_mm": 3290,
"extreme_days": 18,
"max_daily_mm": 218,
"major_flood_events": 17,
},
],
# Key climate change findings
"key_findings": [
"Maximum daily rainfall intensity has increased by 43%",
"Extreme rain days (>100mm) have increased by 64% since 1995",
"Major flood events have doubled in the last decade",
"Southwest monsoon intensity shows increasing trend",
"Inter-monsoonal rainfall becoming more erratic",
],
# High-risk months
"high_risk_periods": [
{"months": "May-June", "type": "Southwest Monsoon Onset", "risk": "high"},
{"months": "October-November", "type": "Northeast Monsoon", "risk": "high"},
{"months": "April-May", "type": "Inter-monsoon (First)", "risk": "medium"},
],
}
# Cache the data
_floodwatch_historical_cache = historical_data
_floodwatch_cache_time = utc_now()
return historical_data
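# Illustrative use of the historical dataset above: compare the first and last decade
# to express the change in extreme-rain days, mirroring the "key_findings" narrative.
def _example_extreme_day_trend() -> float:
    """Return the percentage change in extreme rain days, first vs. last decade."""
    decades = tool_floodwatch_historical()["decadal_analysis"]
    first, last = decades[0]["extreme_days"], decades[-1]["extreme_days"]
    return round((last - first) / first * 100, 1)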
def tool_calculate_national_threat(
river_data: Optional[Dict[str, Any]] = None, dmc_alerts: Optional[List[str]] = None
) -> Dict[str, Any]:
"""
Calculate national flood threat score (0-100).
Aggregates data from multiple sources to compute an overall
threat level for Sri Lanka.
Args:
river_data: RiverNet data with river statuses
dmc_alerts: List of active DMC alerts
Returns:
Dict with threat score, breakdown, and risk districts
"""
logger.info("[THREAT] Calculating national threat score")
score = 0
breakdown = {
"river_contribution": 0,
"alert_contribution": 0,
"seasonal_contribution": 0,
}
critical_districts = []
high_risk_districts = []
medium_risk_districts = []
# 1. River status contribution (max 50 points)
if river_data and river_data.get("rivers"):
for river in river_data.get("rivers", []):
status = river.get("status", "unknown").lower()
region = river.get("region", "")
if status == "danger":
breakdown["river_contribution"] += 15
if region and region not in critical_districts:
critical_districts.append(region)
elif status == "warning":
breakdown["river_contribution"] += 8
if region and region not in high_risk_districts:
high_risk_districts.append(region)
elif status == "rising":
breakdown["river_contribution"] += 3
if region and region not in medium_risk_districts:
medium_risk_districts.append(region)
breakdown["river_contribution"] = min(50, breakdown["river_contribution"])
# 2. DMC Alert contribution (max 30 points)
if dmc_alerts:
for alert in dmc_alerts:
alert_lower = alert.lower() if isinstance(alert, str) else ""
if any(kw in alert_lower for kw in ["red", "danger", "severe", "extreme"]):
breakdown["alert_contribution"] += 10
elif any(kw in alert_lower for kw in ["warning", "heavy"]):
breakdown["alert_contribution"] += 5
elif any(kw in alert_lower for kw in ["advisory", "caution"]):
breakdown["alert_contribution"] += 2
breakdown["alert_contribution"] = min(30, breakdown["alert_contribution"])
# 3. Seasonal contribution (max 20 points)
current_month = utc_now().month
monsoon_months = {5: 15, 6: 18, 10: 15, 11: 18} # High risk months
inter_monsoon = {4: 8, 9: 8} # Medium risk
if current_month in monsoon_months:
breakdown["seasonal_contribution"] = monsoon_months[current_month]
elif current_month in inter_monsoon:
breakdown["seasonal_contribution"] = inter_monsoon[current_month]
else:
breakdown["seasonal_contribution"] = 3
# Calculate total score
score = sum(breakdown.values())
score = min(100, max(0, score))
# Determine threat level
if score >= 70:
threat_level = "CRITICAL"
color = "red"
elif score >= 50:
threat_level = "HIGH"
color = "orange"
elif score >= 30:
threat_level = "MODERATE"
color = "yellow"
else:
threat_level = "LOW"
color = "green"
return {
"national_threat_score": score,
"threat_level": threat_level,
"color": color,
"breakdown": breakdown,
"risk_summary": {
"critical_count": len(critical_districts),
"high_count": len(high_risk_districts),
"medium_count": len(medium_risk_districts),
"critical_districts": critical_districts,
"high_risk_districts": high_risk_districts,
"medium_risk_districts": medium_risk_districts,
},
"calculated_at": utc_now().isoformat(),
}
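# Sketch of how the threat score is intended to be composed from the other tools:
# river statuses from rivernet.lk plus DMC alert strings feed the 0-100 score above.
# This wiring is illustrative; the agent may call the tools independently.
def _example_national_threat_pipeline() -> Dict[str, Any]:
    """Compute the national threat score from live river data and DMC alerts."""
    rivers = tool_rivernet_status()
    dmc = tool_dmc_alerts().get("alerts", [])
    return tool_calculate_national_threat(river_data=rivers, dmc_alerts=dmc)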
# ============================================
# SITUATIONAL AWARENESS TOOLS (NEW)
# CEB Power, Fuel, CBSL Economy, Health, Commodities, Water
# ============================================
# Cache for situational awareness data
_ceb_cache: Dict[str, Any] = {}
_ceb_cache_time: Optional[datetime] = None
_fuel_cache: Dict[str, Any] = {}
_fuel_cache_time: Optional[datetime] = None
_cbsl_cache: Dict[str, Any] = {}
_cbsl_cache_time: Optional[datetime] = None
_health_cache: Dict[str, Any] = {}
_health_cache_time: Optional[datetime] = None
_commodity_cache: Dict[str, Any] = {}
_commodity_cache_time: Optional[datetime] = None
_water_cache: Dict[str, Any] = {}
_water_cache_time: Optional[datetime] = None
SA_CACHE_DURATION_MINUTES = 15 # 15 minute cache for all SA tools
def tool_ceb_power_status() -> Dict[str, Any]:
"""
Get CEB power outage / load shedding schedule for Sri Lanka.
ENHANCED:
- Scrapes ceb.lk for official schedules and PDF press releases
- Extracts text from Dropbox-hosted PDF announcements
- Falls back to news sites for power-related updates
Returns:
Dict with schedules by area, current status, and timestamp
"""
global _ceb_cache, _ceb_cache_time
# Check cache
if _ceb_cache_time:
cache_age = (utc_now() - _ceb_cache_time).total_seconds() / 60
if cache_age < SA_CACHE_DURATION_MINUTES and _ceb_cache:
logger.info(f"[CEB] Using cached data ({cache_age:.1f} min old)")
return _ceb_cache
logger.info("[CEB] Fetching power outage status...")
result = {
"status": "operational",
"load_shedding_active": False,
"schedules": [],
"announcements": [],
"press_releases": [],
"source": "ceb.lk",
"fetched_at": utc_now().isoformat(),
"scrape_status": "baseline",
}
pdf_links_found = []
try:
# Try to scrape CEB website
resp = _safe_get("https://ceb.lk/", timeout=30)
if resp:
soup = BeautifulSoup(resp.text, "html.parser")
page_text = soup.get_text(separator="\n", strip=True).lower()
# Check for load shedding keywords
if any(
kw in page_text
for kw in ["load shedding", "power cut", "outage schedule"]
):
result["load_shedding_active"] = True
result["status"] = "load_shedding"
# Extract any announcements
for tag in soup.find_all(
["marquee", "div", "p"],
class_=lambda x: x and "announce" in str(x).lower(),
):
text = tag.get_text(strip=True)
if text and len(text) > 20:
result["announcements"].append(text[:200])
# ENHANCED: Find PDF links (Dropbox, direct PDFs, press releases)
for link in soup.find_all("a", href=True):
href = link.get("href", "")
link_text = link.get_text(strip=True).lower()
# Check for Dropbox links or PDF links
is_dropbox = "dropbox.com" in href
is_pdf = href.lower().endswith(".pdf")
is_press_release = any(
kw in link_text
for kw in ["press release", "announcement", "notice", "schedule"]
)
if is_dropbox or is_pdf or is_press_release:
# Convert Dropbox links for direct download
if is_dropbox:
# Change dl=0 to dl=1 for direct download
if "dl=0" in href:
href = href.replace("dl=0", "dl=1")
elif "?dl=" not in href and "&dl=" not in href:
href = href + ("&" if "?" in href else "?") + "dl=1"
pdf_links_found.append(
{
"url": href,
"title": link_text or "Press Release",
"is_dropbox": is_dropbox,
}
)
# Limit to latest 3 PDFs to avoid too many downloads
pdf_links_found = pdf_links_found[:3]
# Extract text from PDF links
for pdf_info in pdf_links_found:
try:
logger.info(f"[CEB] Extracting PDF: {pdf_info['title'][:50]}...")
pdf_text = _extract_text_from_pdf_url(pdf_info["url"])
if pdf_text and not pdf_text.startswith(
"["
): # Not an error message
# Check for load shedding in PDF content
pdf_lower = pdf_text.lower()
if any(
kw in pdf_lower
for kw in [
"load shedding",
"power cut",
"outage",
"interruption",
]
):
result["load_shedding_active"] = True
result["status"] = "load_shedding"
result["press_releases"].append(
{
"title": pdf_info["title"],
"content": pdf_text[:1000]
+ ("..." if len(pdf_text) > 1000 else ""),
"source": (
"dropbox" if pdf_info["is_dropbox"] else "ceb.lk"
),
}
)
result["scrape_status"] = "live"
except Exception as pdf_error:
logger.warning(f"[CEB] PDF extraction error: {pdf_error}")
logger.info(
f"[CEB] Scraped - PDFs found: {len(pdf_links_found)}, Active: {result['load_shedding_active']}"
)
# Also check news sites for power-related updates
news_sources = [
"https://www.news.lk/",
"https://www.dailymirror.lk/",
]
for news_url in news_sources:
try:
news_resp = _safe_get(news_url, timeout=20)
if news_resp:
news_soup = BeautifulSoup(news_resp.text, "html.parser")
news_text = news_soup.get_text(separator=" ", strip=True).lower()
# Check for power-related news
if any(
kw in news_text
for kw in ["power cut", "load shedding", "ceb", "electricity"]
):
# Look for headlines mentioning power
for headline in news_soup.find_all(["h1", "h2", "h3", "h4"]):
h_text = headline.get_text(strip=True)
if any(
kw in h_text.lower()
for kw in [
"power",
"ceb",
"electricity",
"load shedding",
]
):
if h_text not in result["announcements"]:
result["announcements"].append(
f"[News] {h_text[:150]}"
)
break
except Exception as news_error:
logger.debug(f"[CEB] News scraping error for {news_url}: {news_error}")
# If no press releases or announcements found, provide baseline message
if not result["press_releases"] and not result["announcements"]:
result["status"] = "no_load_shedding"
result["announcements"].append("CEB: Normal power supply across the island")
except Exception as e:
logger.warning(f"[CEB] Scraping error: {e}")
result["status"] = "unknown"
result["error"] = str(e)
# Update cache
_ceb_cache = result
_ceb_cache_time = utc_now()
return result
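# Small sketch of the Dropbox-link normalisation used above: share links are rewritten
# with dl=1 so requests downloads the file bytes instead of the HTML preview page.
# Standalone illustration only; the tool keeps its own inline copy of this logic.
def _example_dropbox_direct_url(href: str) -> str:
    """Return a Dropbox share link rewritten for direct download (dl=1)."""
    if "dropbox.com" not in href:
        return href
    if "dl=0" in href:
        return href.replace("dl=0", "dl=1")
    if "?dl=" not in href and "&dl=" not in href:
        return href + ("&" if "?" in href else "?") + "dl=1"
    return href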
def tool_fuel_prices() -> Dict[str, Any]:
"""
Get current fuel prices in Sri Lanka.
    Starts from a CEYPETCO price baseline and scrapes news sources for newer announcements.
Returns:
Dict with prices for petrol, diesel, kerosene, and last update
"""
global _fuel_cache, _fuel_cache_time
# Check cache
if _fuel_cache_time:
cache_age = (utc_now() - _fuel_cache_time).total_seconds() / 60
if cache_age < SA_CACHE_DURATION_MINUTES and _fuel_cache:
logger.info(f"[FUEL] Using cached data ({cache_age:.1f} min old)")
return _fuel_cache
logger.info("[FUEL] Fetching fuel prices...")
# December 2025 CEYPETCO prices (confirmed unchanged from November 2025)
# Source: CEYPETCO official announcement
result = {
"prices": {
"petrol_92": {"price": 294.00, "unit": "LKR/L", "name": "Petrol 92 Octane"},
"petrol_95": {"price": 335.00, "unit": "LKR/L", "name": "Petrol 95 Octane"},
"auto_diesel": {"price": 277.00, "unit": "LKR/L", "name": "Auto Diesel"},
"super_diesel": {"price": 318.00, "unit": "LKR/L", "name": "Super Diesel"},
"kerosene": {"price": 185.00, "unit": "LKR/L", "name": "Kerosene"},
},
"last_revision": "2025-12-01", # Prices unchanged for December 2025
"source": "CEYPETCO",
"fetched_at": utc_now().isoformat(),
"note": "Prices confirmed unchanged for December 2025",
}
try:
# Try to scrape news for latest fuel price announcements
news_sources = [
"https://www.news.lk/",
"https://www.dailymirror.lk/",
"https://www.newsfirst.lk/",
]
for source_url in news_sources:
resp = _safe_get(source_url, timeout=20)
if resp:
soup = BeautifulSoup(resp.text, "html.parser")
page_text = soup.get_text(separator=" ", strip=True).lower()
# Look for fuel price mentions
if "fuel" in page_text and ("price" in page_text or "lkr" in page_text):
# Extract prices using regex
petrol_match = re.search(
r"petrol\s*(?:92|95)?\s*(?:octane)?\s*[:\-]?\s*(?:rs\.?|lkr)?\s*(\d{2,3}(?:\.\d{2})?)",
page_text,
)
diesel_match = re.search(
r"diesel\s*[:\-]?\s*(?:rs\.?|lkr)?\s*(\d{2,3}(?:\.\d{2})?)",
page_text,
)
if petrol_match:
try:
result["prices"]["petrol_92"]["price"] = float(
petrol_match.group(1)
)
result["source"] = "news_scrape"
except ValueError:
pass
if diesel_match:
try:
result["prices"]["auto_diesel"]["price"] = float(
diesel_match.group(1)
)
except ValueError:
pass
break
logger.info(
f"[FUEL] Fetched prices - Petrol 92: {result['prices']['petrol_92']['price']}"
)
except Exception as e:
logger.warning(f"[FUEL] Scraping error: {e}")
result["error"] = str(e)
# Update cache
_fuel_cache = result
_fuel_cache_time = utc_now()
return result
def tool_cbsl_indicators() -> Dict[str, Any]:
"""
Get key economic indicators from Central Bank of Sri Lanka.
Scrapes live data from cbsl.gov.lk including:
- Exchange rates (USD/LKR TT Buy/Sell)
- CCPI Inflation
- Overnight Policy Rate
    - Forex reserves (baseline estimate; not scraped live)
Returns:
Dict with economic indicators and trend data
"""
global _cbsl_cache, _cbsl_cache_time
# Check cache
if _cbsl_cache_time:
cache_age = (utc_now() - _cbsl_cache_time).total_seconds() / 60
if cache_age < SA_CACHE_DURATION_MINUTES and _cbsl_cache:
logger.info(f"[CBSL] Using cached data ({cache_age:.1f} min old)")
return _cbsl_cache
logger.info("[CBSL] Fetching economic indicators from cbsl.gov.lk...")
# Baseline economic data (December 2025 - latest known values)
result = {
"indicators": {
"inflation": {
"ccpi_yoy": 2.10, # CCPI Year-on-year inflation %
"ncpi_yoy": 2.5,
"trend": "stable",
"unit": "%",
},
"policy_rates": {
"sdfr": 7.25, # Standing Deposit Facility Rate (Dec 2025)
"slfr": 8.25, # Standing Lending Facility Rate
"overnight_rate": 7.75, # Overnight Policy Rate
"last_change": "2024-12-01",
"change_direction": "decreased",
},
"exchange_rate": {
"usd_lkr_buy": 305.32, # TT Buy rate
"usd_lkr_sell": 312.91, # TT Sell rate
"usd_lkr": 309.12, # Mid rate
"eur_lkr": 325.50,
"gbp_lkr": 390.25,
"trend": "stable",
},
"forex_reserves": {
"value": 6.5, # Billion USD (estimate Dec 2025)
"unit": "Billion USD",
"months_of_imports": 4.0,
"trend": "improving",
},
},
"source": "cbsl.gov.lk",
"fetched_at": utc_now().isoformat(),
"data_as_of": "2025-12",
"scrape_status": "baseline",
}
try:
# Try to scrape CBSL for updated rates
resp = _safe_get("https://www.cbsl.gov.lk/", timeout=30)
if resp:
soup = BeautifulSoup(resp.text, "html.parser")
page_text = soup.get_text(separator=" ", strip=True)
scraped_any = False
# Extract TT Buy exchange rate (format: "TT Buy 305.3238" or "TT Buy: 305.3238")
tt_buy_match = re.search(
r"TT\s*Buy[:\s]*(\d{2,3}(?:\.\d{2,4})?)", page_text, re.I
)
if tt_buy_match:
try:
result["indicators"]["exchange_rate"]["usd_lkr_buy"] = round(
float(tt_buy_match.group(1)), 2
)
scraped_any = True
except ValueError:
pass
# Extract TT Sell exchange rate
tt_sell_match = re.search(
r"TT\s*Sell[:\s]*(\d{2,3}(?:\.\d{2,4})?)", page_text, re.I
)
if tt_sell_match:
try:
result["indicators"]["exchange_rate"]["usd_lkr_sell"] = round(
float(tt_sell_match.group(1)), 2
)
scraped_any = True
except ValueError:
pass
# Calculate mid rate if we have both buy and sell
if tt_buy_match and tt_sell_match:
buy = result["indicators"]["exchange_rate"]["usd_lkr_buy"]
sell = result["indicators"]["exchange_rate"]["usd_lkr_sell"]
result["indicators"]["exchange_rate"]["usd_lkr"] = round(
(buy + sell) / 2, 2
)
# Extract CCPI Inflation (format: "CCPI Inflation 2.10%" or just "Inflation 2.10 %")
inflation_patterns = [
r"CCPI\s*Inflation[:\s]*(\d{1,2}(?:\.\d{1,2})?)\s*%",
r"Inflation[:\s]*(\d{1,2}(?:\.\d{1,2})?)\s*%",
r"(\d{1,2}(?:\.\d{1,2})?)\s*%\s*(?:CCPI|Inflation)",
]
for pattern in inflation_patterns:
inflation_match = re.search(pattern, page_text, re.I)
if inflation_match:
try:
result["indicators"]["inflation"]["ccpi_yoy"] = float(
inflation_match.group(1)
)
scraped_any = True
break
except ValueError:
pass
# Extract Overnight Policy Rate (format: "Overnight Policy Rate 7.75%" or "Policy Rate 7.75 %")
policy_patterns = [
r"Overnight\s*Policy\s*Rate[:\s]*(\d{1,2}(?:\.\d{1,2})?)\s*%",
r"Policy\s*Rate[:\s]*(\d{1,2}(?:\.\d{1,2})?)\s*%",
r"(\d{1,2}(?:\.\d{1,2})?)\s*%\s*(?:Policy\s*Rate)",
]
for pattern in policy_patterns:
policy_match = re.search(pattern, page_text, re.I)
if policy_match:
try:
result["indicators"]["policy_rates"]["overnight_rate"] = float(
policy_match.group(1)
)
scraped_any = True
break
except ValueError:
pass
if scraped_any:
result["scrape_status"] = "live"
result["data_as_of"] = utc_now().strftime("%Y-%m")
logger.info(
f"[CBSL] ✓ Scraped live data - "
f"USD/LKR Buy: {result['indicators']['exchange_rate']['usd_lkr_buy']}, "
f"Sell: {result['indicators']['exchange_rate']['usd_lkr_sell']}, "
f"Inflation: {result['indicators']['inflation']['ccpi_yoy']}%"
)
else:
logger.info("[CBSL] Using baseline data - no live values matched")
else:
logger.warning("[CBSL] Could not reach cbsl.gov.lk, using baseline data")
except Exception as e:
logger.warning(f"[CBSL] Scraping error: {e}")
result["error"] = str(e)
# Update cache
_cbsl_cache = result
_cbsl_cache_time = utc_now()
return result
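# Worked example of the exchange-rate extraction above: given a text fragment like
# "TT Buy 305.3238 TT Sell 312.9147", the two regexes yield buy/sell floats and the
# mid rate is their average. Purely illustrative; not called by the tool.
def _example_parse_tt_rates(
    text: str = "TT Buy 305.3238 TT Sell 312.9147",
) -> Optional[Dict[str, float]]:
    """Parse TT Buy/Sell rates from free text and return buy/sell/mid, or None."""
    buy = re.search(r"TT\s*Buy[:\s]*(\d{2,3}(?:\.\d{2,4})?)", text, re.I)
    sell = re.search(r"TT\s*Sell[:\s]*(\d{2,3}(?:\.\d{2,4})?)", text, re.I)
    if not (buy and sell):
        return None
    b, s = round(float(buy.group(1)), 2), round(float(sell.group(1)), 2)
    return {"usd_lkr_buy": b, "usd_lkr_sell": s, "usd_lkr": round((b + s) / 2, 2)}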
def tool_health_alerts() -> Dict[str, Any]:
"""
Get health alerts and disease outbreak information for Sri Lanka.
Includes dengue case counts, epidemic alerts, and health advisories.
Filters out navigation text (circulars, menus) for cleaner alerts.
Returns:
Dict with health alerts, disease data, and notifications
"""
global _health_cache, _health_cache_time
# Check cache
if _health_cache_time:
cache_age = (utc_now() - _health_cache_time).total_seconds() / 60
if cache_age < SA_CACHE_DURATION_MINUTES and _health_cache:
logger.info(f"[HEALTH] Using cached data ({cache_age:.1f} min old)")
return _health_cache
logger.info("[HEALTH] Fetching health alerts...")
# Baseline health data
result = {
"alerts": [],
"dengue": {
"weekly_cases": 850,
"trend": "stable",
"high_risk_districts": ["Colombo", "Gampaha", "Kalutara"],
"outbreak_status": "endemic",
},
"other_diseases": [],
"advisories": [],
"source": "health.gov.lk",
"fetched_at": utc_now().isoformat(),
}
try:
# Try to scrape Health Ministry
resp = _safe_get("https://www.health.gov.lk/", timeout=30)
if resp:
soup = BeautifulSoup(resp.text, "html.parser")
# 1. Clean up DOM - Remove navigation, footers, scripts that contain keyword noise
for trash in soup.find_all(
["nav", "header", "footer", "script", "style", "noscript", "iframe"]
):
trash.decompose()
# Also remove specific menu containers if identifiable
for menu in soup.select(".menu, .navigation, #main-menu, .top-bar"):
menu.decompose()
# 2. Look for explicit alerts first (Marquees, Alert Banners)
explicit_alerts = []
# Check marquees (common on govt sites)
for marquee in soup.find_all("marquee"):
text = marquee.get_text(strip=True)
if text and len(text) > 20 and "welcome" not in text.lower():
explicit_alerts.append(text)
# Check alert divs
for alert_div in soup.select(".alert, .notice, .warning, .news-ticker"):
text = alert_div.get_text(strip=True)
if text and len(text) > 20:
explicit_alerts.append(text)
# Add explicit alerts found
for alert_text in explicit_alerts[:3]: # Limit to 3
# Filter out "Circular" noise which is document listing, not public health alert
if "circular" not in alert_text.lower():
result["alerts"].append(
{
"type": "health_notice",
"text": alert_text[:200], # Truncate clean text
"severity": "medium",
}
)
# 3. If no explicit alerts, do a safer text search on remaining body content
if not result["alerts"]:
# Get text only from main content area if possible
main_content = (
soup.select_one("main, #content, .container, body") or soup.body
)
page_text = main_content.get_text(separator=" ", strip=True).lower()
# Check for outbreak keywords in context
outbreak_keywords = [
"dengue outbreak",
"epidemic alert",
"health emergency",
"spread of disease",
"influenza warning",
]
for kw in outbreak_keywords:
if kw in page_text:
idx = page_text.find(kw)
# Extract sentence-like context
context = page_text[max(0, idx - 20) : idx + 150]
# Clean up
context = " ".join(context.split())
if len(context) > 20 and "circular" not in context:
result["alerts"].append(
{
"type": "health_notice",
"text": f"...{context}...",
"severity": "medium",
}
)
break
            # 4. Check for Dengue stats specifically (derive the text from the cleaned
            #    soup so this also works when explicit alerts were found above)
            stats_text = soup.get_text(separator=" ", strip=True).lower()
            dengue_match = re.search(r"dengue[:\s]*(\d{1,5})\s*(?:cases?)?", stats_text)
            if dengue_match:
                try:
                    result["dengue"]["weekly_cases"] = int(dengue_match.group(1))
                    logger.info(
                        f"[HEALTH] Found Dengue cases: {result['dengue']['weekly_cases']}"
                    )
                except ValueError:
                    pass
except Exception as e:
logger.warning(f"[HEALTH] Scraping error: {e}")
# Don't fail completely, return baseline
# fallback: If still no alerts, maybe add seasonal one
if not result["alerts"]:
current_month = utc_now().month
if current_month in [5, 6, 10, 11, 12]: # Monsoon = mosquito season
result["advisories"].append(
{
"type": "seasonal",
"text": "Mosquito Control: Remove stagnant water to prevent Dengue breeding.",
"severity": "medium",
}
)
# Update cache
_health_cache = result
_health_cache_time = utc_now()
return result
def tool_commodity_prices() -> Dict[str, Any]:
"""
Get prices for essential commodities in Sri Lanka.
Fetches live prices from UN World Food Programme (WFP) Humanitarian Data Exchange.
Includes rice, sugar, lentils, eggs, chicken, coconut oil, onions, potatoes, and more.
Returns:
Dict with commodity prices, units, and source information
"""
global _commodity_cache, _commodity_cache_time
# Check cache (cache for 60 minutes since WFP data updates weekly)
if _commodity_cache_time:
cache_age = (utc_now() - _commodity_cache_time).total_seconds() / 60
if cache_age < 60 and _commodity_cache:
logger.info(f"[COMMODITY] Using cached data ({cache_age:.1f} min old)")
return _commodity_cache
logger.info("[COMMODITY] Fetching live commodity prices from WFP HDX...")
# WFP Humanitarian Data Exchange - Sri Lanka Food Prices
WFP_HDX_URL = "https://data.humdata.org/dataset/0298c598-d312-4771-b564-f4ac4d831f05/resource/3638f0d6-9969-48cf-a919-1d879d037ec6/download/wfp_food_prices_lka.csv"
# Mapping WFP commodity names to our display names
COMMODITY_MAPPING = {
"Rice (red nadu)": ("White Rice (Nadu)", "grains"),
"Rice (white)": ("White Rice (Samba)", "grains"),
"Rice (red)": ("Red Rice", "grains"),
"Wheat flour": ("Wheat Flour", "grains"),
"Sugar": ("Sugar (White)", "essentials"),
"Lentils": ("Dhal (Lentils)", "pulses"),
"Oil (coconut)": ("Coconut Oil", "cooking"),
"Coconut": ("Coconut (Fresh)", "cooking"),
"Eggs": ("Eggs (per unit)", "protein"),
"Meat (chicken, fresh)": ("Chicken", "protein"),
"Meat (chicken, broiler)": ("Chicken (Broiler)", "protein"),
"Onions (imported)": ("Big Onion", "vegetables"),
"Onions (red)": ("Red Onion", "vegetables"),
"Potatoes (imported)": ("Potatoes", "vegetables"),
"Potatoes (local)": ("Potatoes (Local)", "vegetables"),
"Tomatoes": ("Tomatoes", "vegetables"),
"Cabbage": ("Cabbage", "vegetables"),
"Carrots": ("Carrots", "vegetables"),
"Fuel (diesel)": ("Diesel", "fuel"),
"Fuel (petrol-gasoline)": ("Petrol 92 Octane", "fuel"),
}
commodities = []
data_date = None
source_status = "error"
try:
resp = _safe_get(WFP_HDX_URL, timeout=60)
if resp and resp.status_code == 200:
import csv
import io
from collections import defaultdict
reader = csv.DictReader(io.StringIO(resp.text))
rows = list(reader)
if rows:
# Get the latest date in the dataset
latest_date = max(
row.get("date", "") for row in rows if row.get("date")
)
data_date = latest_date
# Get the latest prices for each commodity (average across markets)
latest_prices: Dict[str, List[float]] = defaultdict(list)
for row in rows:
if row.get("date") == latest_date and row.get("price"):
commodity = row.get("commodity", "")
try:
price = float(row["price"])
latest_prices[commodity].append(price)
except (ValueError, KeyError):
pass
# Calculate average prices and build commodity list
for wfp_name, (display_name, category) in COMMODITY_MAPPING.items():
if wfp_name in latest_prices and latest_prices[wfp_name]:
avg_price = sum(latest_prices[wfp_name]) / len(
latest_prices[wfp_name]
)
unit = "LKR/kg"
if "Eggs" in display_name:
unit = "LKR/each"
elif "Coconut (Fresh)" in display_name:
unit = "LKR/each"
elif "Oil" in display_name:
unit = "LKR/L"
elif "Diesel" in display_name or "Petrol" in display_name:
unit = "LKR/L"
commodities.append(
{
"name": display_name,
"price": round(avg_price, 2),
"unit": unit,
"category": category,
"live": True,
"wfp_commodity": wfp_name,
"markets_sampled": len(latest_prices[wfp_name]),
}
)
source_status = "live"
logger.info(
f"[COMMODITY] ✓ Fetched {len(commodities)} live prices from WFP (data date: {latest_date})"
)
except Exception as e:
logger.warning(f"[COMMODITY] WFP API error: {e}")
source_status = "error"
# Fallback to baseline if no data fetched
if not commodities:
logger.info("[COMMODITY] Using baseline data - WFP API unavailable")
source_status = "baseline"
commodities = [
{
"name": "White Rice (Nadu)",
"price": 220,
"unit": "LKR/kg",
"category": "grains",
"live": False,
},
{
"name": "White Rice (Samba)",
"price": 250,
"unit": "LKR/kg",
"category": "grains",
"live": False,
},
{
"name": "Red Rice",
"price": 240,
"unit": "LKR/kg",
"category": "grains",
"live": False,
},
{
"name": "Sugar (White)",
"price": 240,
"unit": "LKR/kg",
"category": "essentials",
"live": False,
},
{
"name": "Dhal (Lentils)",
"price": 380,
"unit": "LKR/kg",
"category": "pulses",
"live": False,
},
{
"name": "Coconut Oil",
"price": 680,
"unit": "LKR/L",
"category": "cooking",
"live": False,
},
{
"name": "Eggs (per unit)",
"price": 48,
"unit": "LKR/each",
"category": "protein",
"live": False,
},
{
"name": "Chicken",
"price": 1350,
"unit": "LKR/kg",
"category": "protein",
"live": False,
},
{
"name": "Big Onion",
"price": 280,
"unit": "LKR/kg",
"category": "vegetables",
"live": False,
},
{
"name": "Potatoes",
"price": 350,
"unit": "LKR/kg",
"category": "vegetables",
"live": False,
},
]
data_date = utc_now().strftime("%Y-%m-%d")
# Sort by category
category_order = {
"grains": 1,
"essentials": 2,
"pulses": 3,
"cooking": 4,
"protein": 5,
"vegetables": 6,
"fuel": 7,
}
commodities.sort(
key=lambda x: (category_order.get(x.get("category", ""), 99), x.get("name", ""))
)
# Build result
live_count = sum(1 for c in commodities if c.get("live", False))
result = {
"commodities": commodities,
"source": "UN World Food Programme (WFP) Humanitarian Data Exchange",
"source_url": WFP_HDX_URL.replace("/download/wfp_food_prices_lka.csv", ""),
"data_date": data_date,
"scrape_status": source_status,
"fetched_at": utc_now().isoformat(),
"summary": {
"total_items": len(commodities),
"items_live": live_count,
"items_baseline": len(commodities) - live_count,
},
}
# Update cache
_commodity_cache = result
_commodity_cache_time = utc_now()
return result
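# Usage sketch for tool_commodity_prices(): group the returned items by category so a
# caller can, for example, show only vegetables. Field names match the dicts built above.
def _example_commodities_by_category() -> Dict[str, List[str]]:
    """Return commodity display names grouped by category."""
    grouped: Dict[str, List[str]] = {}
    for item in tool_commodity_prices().get("commodities", []):
        grouped.setdefault(item.get("category", "other"), []).append(item["name"])
    return grouped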
def tool_water_supply_alerts() -> Dict[str, Any]:
"""
Get water supply disruption alerts from NWSDB.
Returns information about planned/unplanned water cuts and affected areas.
Returns:
Dict with active disruptions, affected areas, and restoration times
"""
global _water_cache, _water_cache_time
# Check cache
if _water_cache_time:
cache_age = (utc_now() - _water_cache_time).total_seconds() / 60
if cache_age < SA_CACHE_DURATION_MINUTES and _water_cache:
logger.info(f"[WATER] Using cached data ({cache_age:.1f} min old)")
return _water_cache
logger.info("[WATER] Fetching water supply alerts...")
result = {
"status": "normal",
"active_disruptions": [],
"scheduled_maintenance": [],
"source": "waterboard.lk / NWSDB",
"fetched_at": utc_now().isoformat(),
"overall_supply": "stable",
}
try:
# Try to scrape NWSDB website
resp = _safe_get("https://www.waterboard.lk/", timeout=30)
if resp:
soup = BeautifulSoup(resp.text, "html.parser")
# 1. Clean DOM - Remove typically noisy elements
for trash in soup.find_all(
[
"nav",
"header",
"footer",
"script",
"style",
"noscript",
"iframe",
"form",
]
):
trash.decompose()
# Remove menu containers explicitly
for menu in soup.select(
".menu, .navigation, #main-menu, .top-bar, .service-block"
):
menu.decompose()
# 2. Look for explicit alerts (Marquee is common on SL govt sites)
alerts_found = []
# Check marquees
for marquee in soup.find_all("marquee"):
text = marquee.get_text(separator=" ", strip=True)
if len(text) > 10:
alerts_found.append({"text": text, "source": "ticker"})
# Check alert classes
for alert in soup.select(".alert, .notice, .warning, .news-ticker"):
text = alert.get_text(separator=" ", strip=True)
if len(text) > 10:
alerts_found.append({"text": text, "source": "alert_box"})
# 3. If no explicit alerts, search body text with STRICTER validation
if not alerts_found:
main_content = (
soup.select_one("main, #content, .container, body") or soup.body
)
if main_content:
# Get paragraph texts mainly
for p in main_content.find_all(["p", "div", "span"]):
text = p.get_text(strip=True)
if (
len(text) < 20 or len(text) > 300
): # Ignore too short/long blocks
continue
text_lower = text.lower()
# Must have explicit "water" context AND disruption keyword
has_water = any(
w in text_lower
for w in [
"water supply",
"water cut",
"nwsdb",
"water board",
]
)
has_issue = any(
w in text_lower
for w in [
"interruption",
"disruption",
"suspended",
"stopped",
"low pressure",
]
)
# Stopwords that indicate this is NOT an alert (slogans, payment info, etc)
is_garbage = any(
w in text_lower
for w in [
"benefits",
"payment",
"service without",
"bill",
"vision",
"mission",
]
)
if has_water and has_issue and not is_garbage:
alerts_found.append(
{"text": text, "source": "content_match"}
)
# Process found alerts
for item in alerts_found:
text = item["text"]
text_lower = text.lower()
# Double check garbage filtering
if any(
w in text_lower
for w in ["benefits", "payment", "check out", "click here"]
):
continue
result["status"] = "disruptions_reported"
# Extract Area
area = "Multiple areas"
# Common major areas regex
area_match = re.search(
r"(colombo|gampaha|kandy|galle|matara|jaffna|kurunegala|ratnapura|kalutara|negombo)",
text_lower,
re.I,
)
if area_match:
area = area_match.group(1).title()
# Deduplicate
if not any(d["details"] == text for d in result["active_disruptions"]):
result["active_disruptions"].append(
{
"area": area,
"type": "Water Disruption",
"details": text[:200] + ("..." if len(text) > 200 else ""),
"severity": "medium",
}
)
logger.info(
f"[WATER] Fetched - Disruptions: {len(result['active_disruptions'])}"
)
# If no disruptions found via scraping, report normal
if not result["active_disruptions"]:
result["status"] = "normal"
result["overall_supply"] = "Normal water supply across most areas"
except Exception as e:
logger.warning(f"[WATER] Scraping error: {e}")
# Don't overwrite default valid return structure, just add error
result["error"] = str(e)
# Update cache
_water_cache = result
_water_cache_time = utc_now()
return result
# ============================================
# METEOROLOGICAL TOOLS (Upgraded)
# ============================================
def tool_dmc_alerts() -> Dict[str, Any]:
url = "http://www.meteo.gov.lk/index.php?lang=en"
resp = _safe_get(url)
if not resp:
return {
"source": url,
"alerts": ["Failed to fetch alerts from DMC."],
"fetched_at": utc_now().isoformat(),
}
soup = BeautifulSoup(resp.text, "html.parser")
alerts: List[str] = []
keywords = [
"warning",
"advisory",
"alert",
"heavy rain",
"strong wind",
"thunderstorm",
"flood",
"landslide",
"cyclone",
"severe",
]
for text in soup.find_all(string=True):
if len(text.strip()) > 20 and any(k in text.lower() for k in keywords):
clean = re.sub(r"\s+", " ", text.strip())
if clean not in alerts:
alerts.append(clean)
if not alerts:
alerts = ["No active severe weather alerts detected."]
return {
"source": url,
"alerts": alerts[:10],
"fetched_at": utc_now().isoformat(),
}
def tool_weather_nowcast(location: str = "Colombo") -> Dict[str, Any]:
"""
Comprehensive Weather Scraper (Robust Mode):
1. Homepage (General Text).
2. City/District Forecast (Direct URL).
3. Critical Advisory PDFs.
Handles slow loading by capturing content even if timeouts occur.
"""
base_url = "https://meteo.gov.lk/"
city_forecast_url = "https://meteo.gov.lk/index.php?option=com_content&view=article&id=102&Itemid=360&lang=en"
combined_report = []
html_home = ""
html_city = ""
if PLAYWRIGHT_AVAILABLE:
try:
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
# Use a standard browser context (no aggressive blocking)
context = browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
page = context.new_page()
page.set_default_timeout(60000) # Give it 60 seconds (it's slow)
# --- A. Visit Homepage ---
try:
page.goto(base_url, wait_until="domcontentloaded")
# Try to wait for text, but don't crash if it takes too long
try:
page.wait_for_selector("div.itemFullText", timeout=15000)
                    except Exception:
pass
html_home = page.content()
except Exception as e:
# Even if it times out, grab what we have!
logger.warning(
f"[WEATHER] Homepage timeout (capturing partial): {e}"
)
html_home = page.content()
# --- B. Visit City Forecast ---
try:
page.goto(city_forecast_url, wait_until="domcontentloaded")
try:
page.wait_for_selector("table", timeout=15000)
                    except Exception:
pass
html_city = page.content()
except Exception as e:
logger.warning(
f"[WEATHER] City Forecast timeout (capturing partial): {e}"
)
html_city = page.content()
browser.close()
except Exception as e:
logger.warning(f"[WEATHER] Playwright critical fail: {e}")
# Fallback to requests if Playwright returned nothing
if not html_home or len(html_home) < 500:
resp = _safe_get(base_url)
html_home = resp.text if resp else ""
if not html_city or len(html_city) < 500:
resp = _safe_get(city_forecast_url)
html_city = resp.text if resp else ""
if not html_home and not html_city:
return {"error": "Failed to load Meteo.gov.lk"}
# --- PARSE HOMEPAGE ---
soup_home = BeautifulSoup(html_home, "html.parser")
english_forecast = ""
header = soup_home.find(string=re.compile(r"WEATHER FORECAST FOR", re.I))
if header:
container = header.find_parent("div") or header.find_parent("article")
if container:
text = container.get_text(separator="\n", strip=True)
start = text.upper().find("WEATHER FORECAST FOR")
if start != -1:
english_forecast = text[start:][:2500]
if not english_forecast:
main = soup_home.find("div", class_="itemFullText") or soup_home.find(
"div", itemprop="articleBody"
)
english_forecast = (
main.get_text(separator="\n", strip=True)[:2500]
if main
else "General forecast text not found."
)
combined_report.append("--- ISLAND-WIDE GENERAL FORECAST ---")
combined_report.append(english_forecast)
# --- PARSE CITY FORECAST (Districts) ---
if html_city:
soup_city = BeautifulSoup(html_city, "html.parser")
table = soup_city.find("table")
if table:
combined_report.append("\n--- DISTRICT/CITY FORECASTS ---")
rows = table.find_all("tr")
# Header logic
if rows:
header_row = rows[0]
headers = [
th.get_text(strip=True) for th in header_row.find_all(["th", "td"])
]
if not "".join(headers).strip() and len(rows) > 1:
headers = [
th.get_text(strip=True) for th in rows[1].find_all(["th", "td"])
]
clean_header = " | ".join(headers[:4])
combined_report.append(clean_header)
combined_report.append("-" * len(clean_header))
# Row logic
for row in rows:
cols = [td.get_text(strip=True) for td in row.find_all("td")]
if not cols or len(cols) < 2:
continue
if "City" in cols[0] or "Temperature" in cols[0]:
continue
row_text = " | ".join(cols[:4])
combined_report.append(row_text)
# --- PARSE PDF ALERTS ---
pdf_links = soup_home.find_all("a", href=True)
found_pdfs = []
for a in pdf_links:
link_text = a.get_text(strip=True)
href = a["href"]
if "pdf" in href.lower() and any(
k in link_text.lower() for k in ["advisory", "warning"]
):
abs_url = _make_absolute(href, base_url)
if abs_url not in [p["url"] for p in found_pdfs]:
prio = 1 if "english" in link_text.lower() else 2
found_pdfs.append({"title": link_text, "url": abs_url, "prio": prio})
found_pdfs.sort(key=lambda x: x["prio"])
for pdf in found_pdfs[:2]:
text = _extract_text_from_pdf_url(pdf["url"])
if "Sinhala/Tamil" not in text and len(text) > 50:
combined_report.append(f"\n--- CRITICAL ALERT: {pdf['title']} ---\n{text}")
# Final Cleanup
final_text = "\n\n".join(combined_report)
cleanup = ["DEPARTMENT OF METEOROLOGY", "Loading...", "Listen To The Weather"]
for c in cleanup:
final_text = final_text.replace(c, "")
return {
"location": "All Districts",
"forecast": final_text,
"source": base_url,
"fetched_at": utc_now().isoformat(),
}
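# Illustrative usage (hedged sketch; needs meteo.gov.lk to be reachable, and Playwright for the
# full report, otherwise the requests fallback is used and the report may be shorter):
# >>> report = tool_weather_nowcast()
# >>> report["location"]
# 'All Districts'
# >>> "ISLAND-WIDE GENERAL FORECAST" in report["forecast"]
# True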
# ============================================
# NEWS SCRAPING TOOLS
# ============================================
LOCAL_NEWS_SITES = [
{
"url": "https://www.dailymirror.lk/",
"name": "Daily Mirror",
"article_selector": "article, .news-block, .article, .card",
},
{
"url": "https://www.ft.lk/",
"name": "Daily FT",
"article_selector": "article, .article-list-item, .card",
},
{
"url": "https://www.newsfirst.lk/",
"name": "News First",
"article_selector": ".post, article, .news-block",
},
]
def scrape_local_news_impl(
keywords: Optional[List[str]] = None,
max_articles: int = 30,
) -> List[Dict[str, Any]]:
results: List[Dict[str, Any]] = []
for site in LOCAL_NEWS_SITES:
try:
resp = _safe_get(site["url"])
if not resp:
logger.warning(f"[NEWS] Failed to fetch {site['url']}")
continue
soup = BeautifulSoup(resp.text, "html.parser")
articles = soup.select(site.get("article_selector", "article"))
for article in articles:
title_elem = (
article.find("h1")
or article.find("h2")
or article.find("h3")
or article.find(
class_=re.compile(r"(title|headline|heading)", re.I)
)
)
title = title_elem.get_text(strip=True) if title_elem else ""
if not title or len(title) < 8:
a = article.find("a", href=True)
title = title or (a.get_text(strip=True) if a else "")
if not title or len(title) < 8:
continue
if not _contains_keyword(title, keywords):
continue
link_elem = article.find("a", href=True)
href = link_elem["href"] if link_elem else site["url"]
href = _make_absolute(href, site["url"])
snippet_elem = article.find("p") or article.find(
class_=re.compile(r"(excerpt|summary|description)", re.I)
)
snippet = (
snippet_elem.get_text(strip=True)[:300] if snippet_elem else ""
)
results.append(
{
"source": site["name"],
"source_url": site["url"],
"headline": title,
"snippet": snippet,
"url": href,
"timestamp": utc_now().isoformat(),
}
)
if len(results) >= max_articles:
return results
except Exception as e:
logger.error(f"[NEWS] Error scraping {site['name']}: {e}")
continue
return results
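# Illustrative usage (hedged; results depend on the live front pages at call time):
# >>> articles = scrape_local_news_impl(keywords=["flood"], max_articles=5)
# >>> articles[0]["source"], articles[0]["headline"]   # e.g. ('Daily Mirror', 'Flood warnings ...')
# Each entry also carries "snippet", "url", "source_url" and an ISO "timestamp".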
# ============================================
# REDDIT SCRAPING
# ============================================
def scrape_reddit_impl(
keywords: List[str],
limit: int = 20,
subreddit: Optional[str] = None,
) -> List[Dict[str, Any]]:
base = (
f"https://www.reddit.com/r/{subreddit}/search.json"
if subreddit
else "https://www.reddit.com/search.json"
)
query = " ".join(keywords) if keywords else "Sri Lanka"
params = {
"q": query,
"sort": "new",
"limit": str(limit),
"restrict_sr": "on" if subreddit else "off",
}
headers = {
"User-Agent": DEFAULT_HEADERS["User-Agent"],
"Accept": "application/json",
}
try:
resp = requests.get(
base, headers=headers, params=params, timeout=DEFAULT_TIMEOUT
)
if resp.status_code != 200:
logger.warning(f"[REDDIT] HTTP {resp.status_code} for {base}")
return [
{"error": f"Reddit returned status {resp.status_code}", "query": query}
]
data = resp.json()
posts_raw = data.get("data", {}).get("children", [])
posts: List[Dict[str, Any]] = []
for p in posts_raw:
d = p.get("data", {})
title = d.get("title") or ""
selftext = d.get("selftext") or ""
text = f"{title}\n{selftext}"
if not _contains_keyword(text, keywords):
continue
posts.append(
{
"id": d.get("id"),
"title": title,
"selftext": selftext[:500],
"subreddit": d.get("subreddit"),
"author": d.get("author"),
"score": d.get("score", 0),
"url": "https://www.reddit.com" + d.get("permalink", ""),
"created_utc": d.get("created_utc"),
"num_comments": d.get("num_comments", 0),
}
)
return (
posts
if posts
else [{"note": f"No Reddit posts found for: {query}", "query": query}]
)
except Exception as e:
logger.error(f"[REDDIT] Error: {e}")
return [{"error": str(e), "query": query}]
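# Illustrative usage (hedged; relies on Reddit's public search JSON endpoint being available):
# >>> posts = scrape_reddit_impl(["power cut"], limit=10, subreddit="srilanka")
# >>> posts[0]["title"], posts[0]["score"], posts[0]["num_comments"]
# On failure the list contains a single {"error": ...} or {"note": ...} dict instead of posts.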
# ============================================
# CSE / STOCK DATA
# ============================================
def _scrape_cse_website_data(symbol: str) -> Optional[Dict[str, Any]]:
"""
Scrape stock data directly from CSE website.
This is more reliable than yfinance for Sri Lankan stocks.
"""
try:
cse_url = "https://www.cse.lk/"
resp = _safe_get(cse_url, timeout=30)
if not resp:
return None
soup = BeautifulSoup(resp.text, "html.parser")
text = soup.get_text(separator="\n", strip=True)
result_data = {}
# Pattern for ASPI (All Share Price Index)
# CSE website typically shows: "ASPI 12,345.67 +123.45 (+1.01%)"
aspi_patterns = [
r"ASPI[:\s]*([\d,]+\.?\d*)\s*(?:points?)?\s*[\(\[]?([+-]?[\d,]+\.?\d*)\s*(?:points?)?[\)\]]?\s*[\(\[]?([+-]?[\d,]*\.?\d*)%?[\)\]]?",
r"All\s*Share\s*(?:Price\s*)?Index[:\s]*([\d,]+\.?\d*)",
r"ASPI[^\d\n\r]*([\d,]+\.\d+)",
]
for pattern in aspi_patterns:
m = re.search(pattern, text, re.I)
if m:
try:
value = float(m.group(1).replace(",", ""))
result_data["aspi"] = {
"value": value,
"change": (
float(m.group(2).replace(",", ""))
if len(m.groups()) > 1 and m.group(2)
else None
),
"change_pct": (
float(m.group(3).replace(",", "").replace("%", ""))
if len(m.groups()) > 2 and m.group(3)
else None
),
}
break
except (ValueError, IndexError):
continue
# Pattern for S&P SL20 index
sp_patterns = [
r"S&?P\s*SL\s*20[:\s]*([\d,]+\.?\d*)",
r"SL20[:\s]*([\d,]+\.?\d*)",
]
for pattern in sp_patterns:
m = re.search(pattern, text, re.I)
if m:
try:
result_data["sp_sl20"] = float(m.group(1).replace(",", ""))
break
except ValueError:
continue
# Check if we got any useful data
if result_data:
return result_data
# Fallback: simple ASPI pattern
m = re.search(
r"(ASPI|All Share Price Index)[^\d\n\r]*([\d,]+\.\d+)", text, re.I
)
if m:
return {"aspi": {"value": float(m.group(2).replace(",", ""))}}
return None
except Exception as e:
logger.debug(f"[CSE] Direct CSE scrape failed: {e}")
return None
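# Quick sanity check of the simplest ASPI pattern used above (pure regex, no network needed;
# the figures are made up for illustration):
# >>> m = re.search(r"ASPI[^\d\n\r]*([\d,]+\.\d+)", "ASPI 12,345.67 +10.21 (0.08%)", re.I)
# >>> float(m.group(1).replace(",", ""))
# 12345.67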
def scrape_cse_stock_impl(
symbol: str = "ASPI",
period: str = "1d",
interval: str = "1h",
) -> Dict[str, Any]:
"""
Fetch CSE stock data with multiple fallback strategies:
1. First try direct CSE website scraping (most reliable for Sri Lankan stocks)
2. Fall back to yfinance if direct scraping fails
Note: yfinance often fails for CSE symbols as Yahoo Finance has limited
coverage of the Colombo Stock Exchange.
"""
symbol_upper = symbol.upper()
is_index = symbol_upper in ("ASPI", "ASPI.N0000", "^N0000", "ALL SHARE")
# ============ Strategy 1: Direct CSE Website Scraping ============
# This is more reliable for Sri Lankan market data
if is_index:
logger.info(f"[CSE] Attempting direct CSE website scrape for {symbol}...")
cse_data = _scrape_cse_website_data(symbol)
if cse_data and "aspi" in cse_data:
aspi_info = cse_data["aspi"]
summary = {
"current_price": aspi_info.get("value", 0),
"change": aspi_info.get("change"),
"change_pct": aspi_info.get("change_pct"),
}
# Add S&P SL20 if available
if "sp_sl20" in cse_data:
summary["sp_sl20"] = cse_data["sp_sl20"]
logger.info(
f"[CSE] Successfully scraped ASPI from CSE website: {summary['current_price']}"
)
return {
"symbol": symbol,
"resolved_symbol": "CSE-direct",
"period": period,
"interval": interval,
"summary": summary,
"records": [],
"source": "cse.lk (direct scrape)",
"note": "Real-time data from Colombo Stock Exchange website",
"fetched_at": utc_now().isoformat(),
}
# ============ Strategy 2: yfinance (Fallback) ============
# Note: This frequently fails for CSE stocks
symbols_to_try = [symbol]
if is_index:
symbols_to_try = ["^N0000", "ASPI.N0000", "ASPI"]
elif not symbol.endswith(".N0000") and not symbol.startswith("^"):
# Try both with and without .N0000 suffix for regular stocks
symbols_to_try = [f"{symbol}.N0000", symbol]
logger.info(f"[CSE] Trying yfinance for symbols: {symbols_to_try}")
for sym in symbols_to_try:
try:
ticker = yf.Ticker(sym)
hist = ticker.history(period=period, interval=interval)
if hist is None or hist.empty:
logger.debug(f"[CSE] yfinance returned empty data for {sym}")
continue
hist = hist.reset_index()
records = hist.to_dict(orient="records")
for record in records:
for key, value in list(record.items()):
if hasattr(value, "isoformat"):
record[key] = value.isoformat()
latest = records[-1] if records else {}
summary = {
"current_price": latest.get("Close", latest.get("close", 0)),
"open": latest.get("Open", latest.get("open", 0)),
"high": latest.get("High", latest.get("high", 0)),
"low": latest.get("Low", latest.get("low", 0)),
"volume": latest.get("Volume", latest.get("volume", 0)),
}
logger.info(f"[CSE] yfinance success for {sym}: {summary['current_price']}")
return {
"symbol": symbol,
"resolved_symbol": sym,
"period": period,
"interval": interval,
"summary": summary,
"records": records[-10:],
"source": "yahoo_finance",
"fetched_at": utc_now().isoformat(),
}
except Exception as e_inner:
logger.debug(f"[CSE] yfinance attempt failed for {sym}: {e_inner}")
continue
# ============ Final Fallback: Try CSE website again for any symbol ============
logger.info("[CSE] All yfinance attempts failed, trying CSE website fallback...")
cse_data = _scrape_cse_website_data(symbol)
if cse_data and "aspi" in cse_data:
return {
"symbol": symbol,
"resolved_symbol": "CSE-fallback",
"period": period,
"interval": interval,
"summary": {"current_price": cse_data["aspi"].get("value", 0)},
"records": [],
"source": "cse.lk (fallback scrape)",
"fetched_at": utc_now().isoformat(),
}
# All strategies failed
logger.warning(f"[CSE] All data sources failed for {symbol}")
return {
"symbol": symbol,
"error": f"Could not fetch data for {symbol}. Yahoo Finance has limited CSE coverage.",
"attempted_symbols": symbols_to_try,
"suggestion": "Try accessing cse.lk directly for real-time CSE data",
"fetched_at": utc_now().isoformat(),
}
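# Illustrative usage (hedged; live market data, so the numbers below are placeholders):
# >>> quote = scrape_cse_stock_impl("ASPI")
# >>> quote["source"]                     # 'cse.lk (direct scrape)' or 'yahoo_finance'
# >>> quote["summary"]["current_price"]   # e.g. 12345.67
# If every strategy fails, the dict carries "error", "attempted_symbols" and "suggestion" instead.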
# ============================================
# GOVERNMENT GAZETTE (Deep Scraping)
# ============================================
def scrape_government_gazette_impl(
keywords: Optional[List[str]] = None,
max_items: int = 15,
) -> List[Dict[str, Any]]:
"""
Scrapes gazette.lk for latest government gazettes.
ENHANCED: Now downloads PDFs and extracts text content from them.
Args:
keywords: Optional list of keywords to filter gazettes (currently ignored)
max_items: Maximum number of gazette entries to process
Returns:
List of gazette entries with PDF content extracted
"""
base_url = "https://www.gazette.lk/government-gazette"
results: List[Dict[str, Any]] = []
logger.info(f"[GAZETTE] Fetching latest gazettes from {base_url}")
resp = _safe_get(base_url)
if not resp:
return [
{
"title": "Failed to access gazette.lk",
"url": base_url,
"error": "Network request failed",
"timestamp": utc_now().isoformat(),
}
]
soup = BeautifulSoup(resp.text, "html.parser")
# Find all gazette article entries
articles = soup.find_all("article")
if not articles:
articles = soup.select(".post, .type-post, .entry")
logger.info(f"[GAZETTE] Found {len(articles)} potential gazette entries")
for article in articles:
if len(results) >= max_items:
break
# Extract title and link
title_elem = (
article.find(class_="entry-title")
or article.find("h2")
or article.find("h3")
)
if not title_elem:
continue
link_elem = title_elem.find("a", href=True)
if not link_elem:
continue
title = link_elem.get_text(strip=True)
post_url = link_elem["href"]
post_url_abs = _make_absolute(post_url, base_url)
# Filter to only include actual gazette entries (not other site content)
if "government gazette" not in title.lower():
continue
# Extract date from title if possible
date_match = re.search(r"(\d{4}\s+\w+\s+\d{1,2})", title)
date_str = date_match.group(1) if date_match else "Unknown date"
logger.info(f"[GAZETTE] Processing: {title[:50]}...")
# ENHANCED: Visit the detail page to find all PDF links
pdf_links = []
pdf_content = []
try:
detail_resp = _safe_get(post_url_abs)
if detail_resp:
detail_soup = BeautifulSoup(detail_resp.text, "html.parser")
# FIXED: First look for pdfemb-viewer class links (gazette.lk specific)
# These have direct PDF URLs like https://www.gazette.lk/dl/Gazette/11/Gazette-2025-11-28E.pdf
pdfemb_links = detail_soup.find_all("a", class_="pdfemb-viewer")
for link in pdfemb_links:
href = link.get("href", "")
if href and ("/dl/Gazette/" in href or ".pdf" in href.lower()):
# Detect language from URL (E=English, S=Sinhala, T=Tamil)
language = "english"
href_lower = href.lower()
if href.endswith("S.pdf") or "sinhala" in href_lower:
language = "sinhala"
elif href.endswith("T.pdf") or "tamil" in href_lower:
language = "tamil"
pdf_url = _make_absolute(href, post_url_abs)
pdf_links.append(
{
"language": language,
"url": pdf_url,
"text": link.get_text(strip=True)
or f"Gazette PDF ({language})",
}
)
logger.info(f"[GAZETTE] Found pdfemb-viewer link: {pdf_url}")
# Also look for any other direct PDF links (backup approach)
if not pdf_links:
for link in detail_soup.find_all("a", href=True):
href = link["href"]
link_text = link.get_text(strip=True).lower()
# Check for direct PDF download paths
is_gazette_pdf = "/dl/Gazette/" in href
is_pdf_file = href.lower().endswith(".pdf")
if is_gazette_pdf or is_pdf_file:
pdf_url = _make_absolute(href, post_url_abs)
# Detect language
language = "english"
if "sinhala" in link_text or href.endswith("S.pdf"):
language = "sinhala"
elif "tamil" in link_text or href.endswith("T.pdf"):
language = "tamil"
elif href.endswith("E.pdf") or "english" in link_text:
language = "english"
# Avoid duplicates
if not any(p["url"] == pdf_url for p in pdf_links):
pdf_links.append(
{
"language": language,
"url": pdf_url,
"text": link.get_text(strip=True)
or f"PDF ({language})",
}
)
logger.info(
f"[GAZETTE] Found {len(pdf_links)} PDF links on detail page"
)
# ENHANCED: Download and extract text from English PDFs (most useful)
english_pdfs = [p for p in pdf_links if p["language"] == "english"]
if not english_pdfs:
english_pdfs = pdf_links[:1] # Fallback to first PDF
for pdf_info in english_pdfs[:2]: # Limit to 2 PDFs per gazette
try:
logger.info(
f"[GAZETTE] Downloading PDF: {pdf_info['url'][:60]}..."
)
extracted_text = _extract_text_from_pdf_url(pdf_info["url"])
if extracted_text and not extracted_text.startswith("["):
pdf_content.append(
{
"language": pdf_info["language"],
"content": extracted_text, # Full content - no truncation
"source_url": pdf_info["url"],
}
)
logger.info(
f"[GAZETTE] Extracted {len(extracted_text)} chars from PDF"
)
else:
pdf_content.append(
{
"language": pdf_info["language"],
"content": extracted_text,
"source_url": pdf_info["url"],
}
)
except Exception as e:
logger.warning(f"[GAZETTE] PDF extraction error: {e}")
pdf_content.append(
{
"language": pdf_info.get("language", "unknown"),
"content": f"[Error extracting PDF: {str(e)}]",
"source_url": pdf_info.get("url", ""),
}
)
except Exception as e:
logger.warning(f"[GAZETTE] Error fetching detail page: {e}")
# Build the result with extracted content
result_entry = {
"title": title,
"date": date_str,
"url": post_url_abs,
"pdf_links": pdf_links,
"extracted_content": pdf_content,
"timestamp": utc_now().isoformat(),
}
# Add a summary if we have content
if pdf_content:
first_content = pdf_content[0].get("content", "")
if first_content and not first_content.startswith("["):
result_entry["summary"] = first_content[:500]
results.append(result_entry)
logger.info(f"[GAZETTE] Added gazette with {len(pdf_content)} PDF extractions")
if not results:
return [
{
"title": "No gazette entries found",
"url": base_url,
"note": "The website structure may have changed",
"timestamp": utc_now().isoformat(),
}
]
logger.info(
f"[GAZETTE] Successfully scraped {len(results)} gazette entries with PDF content"
)
return results
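# Illustrative usage (hedged; gazette.lk's layout and PDF availability change over time):
# >>> gazettes = scrape_government_gazette_impl(max_items=3)
# >>> gazettes[0]["title"]                  # title of the latest gazette post
# >>> len(gazettes[0]["pdf_links"])         # English/Sinhala/Tamil links found on the detail page
# >>> gazettes[0]["extracted_content"][0]["language"]   # usually 'english' when a PDF was parsed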
# ============================================
# PARLIAMENT MINUTES
# ============================================
def scrape_parliament_minutes_impl(
keywords: Optional[List[str]] = None,
max_items: int = 20,
) -> List[Dict[str, Any]]:
"""
Scrape Sri Lankan Parliament Hansards from parliament.lk.
ENHANCED: Now properly extracts Hansard PDF links with dates and metadata.
The website stores PDFs at /uploads/businessdocs/ with date-encoded filenames.
Args:
keywords: Optional keywords to filter results
max_items: Maximum number of items to return
Returns:
List of Hansard entries with PDF links and dates
"""
url = "https://www.parliament.lk/en/business-of-parliament/hansards"
logger.info(f"[PARLIAMENT] Fetching Hansards from {url}")
resp = _safe_get(url)
if not resp:
return [
{
"title": "Parliament website unavailable",
"url": url,
"note": "Could not access parliament.lk. Site may be down.",
"timestamp": utc_now().isoformat(),
}
]
soup = BeautifulSoup(resp.text, "html.parser")
results: List[Dict[str, Any]] = []
# Strategy 1: Look for PDF links in /uploads/businessdocs/ (Hansard documents)
pdf_links = soup.find_all(
"a", href=lambda x: x and ".pdf" in x.lower() and "businessdocs" in x.lower()
)
logger.info(f"[PARLIAMENT] Found {len(pdf_links)} Hansard PDF links")
for link in pdf_links:
href = link.get("href", "")
link_text = link.get_text(strip=True)
# Extract date from URL (e.g., 22912_english_2025-11-17.pdf)
date_match = re.search(r"(\d{4}-\d{2}-\d{2})", href)
date_str = date_match.group(1) if date_match else None
# Extract language from URL
language = "english"
href_lower = href.lower()
if "sinhala" in href_lower:
language = "sinhala"
elif "tamil" in href_lower:
language = "tamil"
# Extract document ID from URL
doc_id_match = re.search(r"/(\d+)_", href)
doc_id = doc_id_match.group(1) if doc_id_match else None
# Build title
if date_str:
title = f"Hansard - {date_str} ({language.capitalize()})"
else:
title = f"Hansard ({language.capitalize()})"
# Find parent element for additional context
parent = link.find_parent(["tr", "li", "div", "article"])
if parent:
parent_text = parent.get_text(separator=" ", strip=True)
# Look for session info in parent
session_match = re.search(
r"(Session|Sitting|Day)\s*[:\-]?\s*(\d+)", parent_text, re.I
)
if session_match:
title += f" - {session_match.group(0)}"
# Apply keyword filter if specified
full_text = f"{title} {href} {link_text}"
if keywords and not _contains_keyword(full_text, keywords):
continue
# Construct absolute URL
pdf_url = _make_absolute(href, url)
entry = {
"title": title,
"url": pdf_url,
"date": date_str,
"language": language,
"document_id": doc_id,
"link_text": link_text,
"timestamp": utc_now().isoformat(),
}
# Avoid duplicates (same doc, different language links)
if not any(r.get("url") == pdf_url for r in results):
results.append(entry)
if len(results) >= max_items:
break
# Strategy 2: If no PDFs found, fall back to general link search
if not results:
logger.info("[PARLIAMENT] No PDF links found, trying general link search...")
for a in soup.find_all("a", href=True):
title = a.get_text(strip=True)
href = a["href"]
if not title or len(title) < 6:
continue
# Must match hansard-related keywords
combined = f"{title} {href}".lower()
if not re.search(
r"(hansard|minutes|debate|transcript|proceedings)", combined
):
continue
# Apply user keyword filter
if keywords and not _contains_keyword(title, keywords):
continue
href_abs = _make_absolute(href, url)
# Avoid duplicates
if any(r.get("url") == href_abs for r in results):
continue
results.append(
{
"title": title,
"url": href_abs,
"timestamp": utc_now().isoformat(),
}
)
if len(results) >= max_items:
break
if not results:
return [
{
"title": "No parliament Hansards found",
"url": url,
"keywords": keywords,
"note": "The website structure may have changed or no matching documents found.",
"timestamp": utc_now().isoformat(),
}
]
logger.info(f"[PARLIAMENT] Successfully scraped {len(results)} Hansard entries")
return results
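# Illustrative usage (hedged; assumes parliament.lk still publishes Hansard PDFs under
# /uploads/businessdocs/ with date-encoded filenames, so these keys come from Strategy 1):
# >>> hansards = scrape_parliament_minutes_impl(max_items=5)
# >>> hansards[0]["language"], hansards[0]["date"]   # e.g. ('english', '2025-11-17')
# >>> hansards[0]["url"]                             # direct PDF link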
# ============================================
# TRAIN SCHEDULE
# ============================================
def scrape_train_schedule_impl(
from_station: Optional[str] = None,
to_station: Optional[str] = None,
keyword: Optional[str] = None,
max_items: int = 30,
) -> List[Dict[str, Any]]:
url = "https://eservices.railway.gov.lk/schedule/homeAction.action?lang=en"
resp = _safe_get(url)
if not resp:
return [
{
"train": "Railway website unavailable",
"note": "Could not access railway.gov.lk",
"timestamp": utc_now().isoformat(),
}
]
soup = BeautifulSoup(resp.text, "html.parser")
tables = soup.find_all("table")
results: List[Dict[str, Any]] = []
for table in tables:
rows = table.find_all("tr")
for row in rows[1:]:
cols = [td.get_text(strip=True) for td in row.find_all("td")]
if len(cols) < 2:
continue
train_info = {
"train": cols[0] if len(cols) > 0 else "",
"departure": cols[1] if len(cols) > 1 else "",
"arrival": cols[2] if len(cols) > 2 else "",
"route": " → ".join(cols[3:]) if len(cols) > 3 else "",
}
combined = " ".join(cols)
if from_station and from_station.lower() not in combined.lower():
continue
if to_station and to_station.lower() not in combined.lower():
continue
if keyword and keyword.lower() not in combined.lower():
continue
results.append(train_info)
if len(results) >= max_items:
break
if not results:
return [
{
"train": "No train schedules found",
"note": "Railway schedule unavailable or no matches",
"timestamp": utc_now().isoformat(),
}
]
return results
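# Illustrative usage (hedged; the railway e-services page renders its timetable as plain HTML tables):
# >>> trains = scrape_train_schedule_impl(from_station="Colombo", to_station="Kandy")
# >>> trains[0]
# {'train': '...', 'departure': '...', 'arrival': '...', 'route': '...'}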
# ============================================
# TWITTER TRENDING
# ============================================
def _scrape_twitter_trending_with_playwright(
storage_state_path: Optional[str] = None, headless: bool = True
) -> List[Dict[str, Any]]:
ensure_playwright()
trending = []
with sync_playwright() as p:
browser = p.chromium.launch(headless=headless)
context_args = {}
if storage_state_path and os.path.exists(storage_state_path):
context_args["storage_state"] = storage_state_path
context = browser.new_context(**context_args)
page = context.new_page()
try:
page.goto(
"https://twitter.com/i/trends", wait_until="networkidle", timeout=30000
)
if "login" in page.url or page.content().strip() == "":
page.goto(
"https://twitter.com/explore/tabs/trending",
wait_until="networkidle",
timeout=30000,
)
html = page.content()
soup = BeautifulSoup(html, "html.parser")
items = soup.select(
"div[role='article'] a, div[data-testid='trend'], div.trend-card, span.trend-name"
)
seen = set()
for it in items:
text = it.get_text(separator=" ", strip=True)
href = it.get("href") or ""
if not text or len(text) < 2:
continue
if text in seen:
continue
seen.add(text)
trending.append(
{
"trend": text,
"url": (
_make_absolute(href, "https://twitter.com")
if href
else None
),
}
)
if not trending:
for tag in soup.find_all(string=re.compile(r"#\w+")):
t = tag.strip()
if t not in seen:
trending.append({"trend": t, "url": None})
seen.add(t)
return trending
except Exception as e:
logger.error(f"[TWITTER] Playwright trending error: {e}")
return []
finally:
try:
context.close()
except Exception:
pass
browser.close()
def _scrape_twitter_trending_with_nitter(
instance: str = "https://nitter.net",
) -> List[Dict[str, Any]]:
trends = []
try:
search_url = f"{instance}/search?f=tweets&q=Sri%20Lanka%20trend"
resp = _safe_get(search_url)
if not resp:
return []
soup = BeautifulSoup(resp.text, "html.parser")
for a in soup.select("a:not([href^='/pic/'])"):
text = a.get_text(separator=" ", strip=True)
href = a.get("href", "")
if not text:
continue
if len(text) < 3:
continue
trends.append({"trend": text, "url": _make_absolute(href, instance)})
return trends[:20]
except Exception as e:
logger.debug(f"[TWITTER] Nitter fallback failed: {e}")
return []
def scrape_twitter_trending_srilanka(
use_playwright: bool = True, storage_state_site: Optional[str] = None
) -> Dict[str, Any]:
if use_playwright and PLAYWRIGHT_AVAILABLE:
storage_state = None
if storage_state_site:
storage_state = load_playwright_storage_state_path(storage_state_site)
try:
trends = _scrape_twitter_trending_with_playwright(
storage_state_path=storage_state
)
if trends:
return {
"source": "twitter_playwright",
"trends": trends,
"fetched_at": utc_now().isoformat(),
}
except Exception as e:
logger.debug(f"[TWITTER] Playwright attempt failed: {e}")
nitter_instances = [
"https://nitter.net",
"https://nitter.snopyta.org",
"https://nitter.1d4.us",
]
for inst in nitter_instances:
try:
trends = _scrape_twitter_trending_with_nitter(inst)
if trends:
return {
"source": inst,
"trends": trends,
"fetched_at": utc_now().isoformat(),
}
except Exception:
continue
return {
"source": "none",
"trends": [],
"note": "Could not fetch Twitter trends. Try supplying Playwright session or check network.",
}
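# Illustrative usage (hedged; Playwright needs a logged-in session and public Nitter instances
# come and go, so an empty trends list with a note is a normal outcome):
# >>> trends = scrape_twitter_trending_srilanka(use_playwright=False)
# >>> trends["source"], len(trends["trends"])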
# ============================================
# AUTHENTICATED SCRAPERS
# ============================================
def scrape_authenticated_page_via_playwright(
site_name: str,
url: str,
login_flow: Optional[dict] = None,
headless: bool = True,
storage_dir: str = ".sessions",
wait_until: str = "networkidle",
) -> Dict[str, Any]:
if not PLAYWRIGHT_AVAILABLE:
return {
"error": "Playwright not available. Install playwright to use authenticated scrapers."
}
session_path = load_playwright_storage_state_path(site_name, storage_dir)
if not session_path:
if not login_flow:
return {
"error": f"No existing session found for {site_name} and no login_flow provided to create one."
}
try:
session_path = create_or_restore_playwright_session(
site_name,
login_flow=login_flow,
headless=headless,
storage_dir=storage_dir,
wait_until=wait_until,
)
except Exception as e:
return {"error": f"Failed to create Playwright session: {e}"}
html = playwright_fetch_html_using_session(
url, session_path, headless=headless, wait_until=wait_until
)
if not html:
return {
"error": "Failed to fetch page via Playwright session.",
"storage_state": session_path,
}
return {"html": html, "source": url, "storage_state": session_path}
def _simple_parse_posts_from_html(
html: str, base_url: str, max_items: int = 10
) -> List[Dict[str, Any]]:
soup = BeautifulSoup(html, "html.parser")
items: List[Dict[str, Any]] = []
candidates = soup.select(
"article, div.post, div.feed-item, li.stream-item, div._4ikz"
)
if not candidates:
candidates = soup.find_all(["article", "div"], limit=200)
seen = set()
for c in candidates:
title_tag = c.find("h1") or c.find("h2") or c.find("h3") or c.find("a")
if not title_tag:
continue
title = title_tag.get_text(strip=True)
if not title or title in seen or len(title) < 4:
continue
seen.add(title)
a = c.find("a", href=True)
url = _make_absolute(a["href"], base_url) if a else base_url
text = c.get_text(separator=" ", strip=True)[:500]
items.append({"title": title, "snippet": text, "url": url})
if len(items) >= max_items:
break
return items
# ============================================
# LANGCHAIN TOOL WRAPPERS
# ============================================
def clean_linkedin_text(text):
if not text:
return ""
# Remove "…see more" and "See translation"
text = re.sub(r"…\s*see more", "", text, flags=re.IGNORECASE)
text = re.sub(r"See translation", "", text, flags=re.IGNORECASE)
text = re.sub(r"\b\d+[dwmo]\s*•\s*(Edited)?\s*•?", "", text)
text = re.sub(r".+posted this", "", text)
text = re.sub(r"\d+[\.,]?\d*\s*reactions", "", text)
text = "\n".join([line.strip() for line in text.splitlines() if line.strip()])
return text.strip()
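# Worked example of the cleanup above (pure string processing, no network needed):
# >>> clean_linkedin_text("Jane Doe posted this\n2d • Edited •\nBig news from Colombo …see more\n1,234 reactions")
# 'Big news from Colombo'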
@tool
def scrape_linkedin(keywords: Optional[List[str]] = None, max_items: int = 10):
"""
LinkedIn search using Playwright session.
Requires environment variables: LINKEDIN_USER, LINKEDIN_PASSWORD (if creating session).
"""
ensure_playwright()
# 1. Load Session
site = "linkedin"
session_path = load_playwright_storage_state_path(
site, out_dir="src/utils/.sessions"
)
if not session_path:
session_path = load_playwright_storage_state_path(site, out_dir=".sessions")
# If no session, try to create one
if not session_path:
login_flow = {
"login_url": "https://www.linkedin.com/login",
"steps": [
{
"type": "fill",
"selector": 'input[name="session_key"]',
"value_env": "LINKEDIN_USER",
},
{
"type": "fill",
"selector": 'input[name="session_password"]',
"value_env": "LINKEDIN_PASSWORD",
},
{"type": "click", "selector": 'button[type="submit"]'},
{"type": "wait", "selector": "nav", "timeout": 20000},
],
}
try:
session_path = create_or_restore_playwright_session(
site, login_flow=login_flow, headless=True
)
except Exception as e:
return json.dumps(
{"error": f"No session found and failed to create one: {e}"}
)
keyword = " ".join(keywords) if keywords else "Sri Lanka"
results = []
try:
with sync_playwright() as p:
desktop_ua = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/120.0.0.0 Safari/537.36"
)
browser = p.chromium.launch(
headless=True,
args=[
"--disable-blink-features=AutomationControlled",
"--start-maximized",
],
)
context = browser.new_context(
storage_state=session_path, user_agent=desktop_ua, no_viewport=True
)
page = context.new_page()
page.add_init_script(
"Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
)
url = f"https://www.linkedin.com/search/results/content/?keywords={keyword.replace(' ', '%20')}&origin=GLOBAL_SEARCH_HEADER"
try:
logger.info(f"[LINKEDIN] Navigating to {url}")
page.goto(url, timeout=60000, wait_until="domcontentloaded")
except Exception as e:
logger.warning(
f"[LINKEDIN] Page load timed out (or other error), attempting to proceed: {e}"
)
page.wait_for_timeout(random.randint(4000, 7000))
try:
if (
page.locator("a[href*='login']").is_visible()
or "auth_wall" in page.url
):
logger.error(
"[LINKEDIN] Session invalid. Redirected to login/auth wall."
)
return json.dumps(
{"error": "Session invalid. Please refresh session."}
)
except:
pass
seen = set()
no_new_data_count = 0
previous_height = 0
POST_CONTAINER_SELECTOR = "div.feed-shared-update-v2, li.artdeco-card"
TEXT_SELECTOR = (
"div.update-components-text span.break-words, span.break-words"
)
SEE_MORE_SELECTOR = (
"button.feed-shared-inline-show-more-text__see-more-less-toggle"
)
POSTER_SELECTOR = "span.update-components-actor__name span[dir='ltr']"
while len(results) < max_items:
try:
see_more_buttons = page.locator(SEE_MORE_SELECTOR).all()
for btn in see_more_buttons:
if btn.is_visible():
try:
btn.click(timeout=500)
except:
pass
except:
pass
if len(results) == 0:
try:
page.locator(POST_CONTAINER_SELECTOR).first.wait_for(
timeout=5000
)
except:
logger.warning("[LINKEDIN] No posts found on page yet.")
posts = page.locator(POST_CONTAINER_SELECTOR).all()
for post in posts:
if len(results) >= max_items:
break
try:
post.scroll_into_view_if_needed()
raw_text = ""
text_el = post.locator(TEXT_SELECTOR).first
if text_el.is_visible():
raw_text = text_el.inner_text()
else:
raw_text = post.locator(
"div.feed-shared-update-v2__description-wrapper"
).first.inner_text()
cleaned_text = clean_linkedin_text(raw_text)
poster_name = "(Unknown)"
poster_el = post.locator(POSTER_SELECTOR).first
if poster_el.is_visible():
poster_name = poster_el.inner_text().strip()
else:
poster_el = post.locator(
"span.update-components-actor__title span[dir='ltr']"
).first
if poster_el.is_visible():
poster_name = poster_el.inner_text().strip()
key = f"{poster_name[:20]}::{cleaned_text[:30]}"
if cleaned_text and len(cleaned_text) > 20 and key not in seen:
seen.add(key)
results.append(
{
"source": "LinkedIn",
"poster": poster_name,
"text": cleaned_text,
"url": "https://www.linkedin.com",
}
)
logger.info(f"[LINKEDIN] Found post by {poster_name}")
except Exception:
continue
page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
page.wait_for_timeout(random.randint(2000, 4000))
new_height = page.evaluate("document.body.scrollHeight")
if new_height == previous_height:
no_new_data_count += 1
if no_new_data_count > 3:
logger.info("[LINKEDIN] End of feed or stuck.")
break
else:
no_new_data_count = 0
previous_height = new_height
browser.close()
return json.dumps(
{"site": "LinkedIn", "results": results, "storage_state": session_path},
default=str,
)
except Exception as e:
return json.dumps({"error": str(e)})
# =====================================================
# 🔧 TWITTER UTILITY FUNCTIONS
# =====================================================
def clean_twitter_text(text):
"""Clean and normalize tweet text"""
if not text:
return ""
# Remove common Twitter artifacts
text = re.sub(r"Show more", "", text, flags=re.IGNORECASE)
text = re.sub(r"https://t\.co/\w+", "", text) # Remove t.co links
text = re.sub(r"pic\.twitter\.com/\w+", "", text) # Remove pic.twitter.com links
text = re.sub(r"\s+", " ", text) # Normalize whitespace
text = "\n".join([line.strip() for line in text.splitlines() if line.strip()])
return text.strip()
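# Worked example (pure string processing; note the whitespace pass folds the tweet onto one line):
# >>> clean_twitter_text("Flood warning issued https://t.co/abc123\nShow more")
# 'Flood warning issued'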
def extract_twitter_timestamp(tweet_element):
"""Extract timestamp from tweet element"""
try:
timestamp_selectors = [
"time",
"[datetime]",
"a[href*='/status/'] time",
"div[data-testid='User-Name'] a[href*='/status/']",
]
for selector in timestamp_selectors:
if tweet_element.locator(selector).count() > 0:
time_element = tweet_element.locator(selector).first
datetime_attr = time_element.get_attribute("datetime")
if datetime_attr:
return datetime_attr
time_text = time_element.inner_text()
if time_text:
return time_text
except:
pass
return "Unknown"
@tool
def scrape_twitter(query: str = "Sri Lanka", max_items: int = 20):
"""
Twitter scraper - extracts actual tweet text, author, and metadata using Playwright session.
Requires a valid Twitter session file (twitter_storage_state.json or tw_state.json).
"""
ensure_playwright()
# Load Session
site = "twitter"
session_path = load_playwright_storage_state_path(
site, out_dir="src/utils/.sessions"
)
if not session_path:
session_path = load_playwright_storage_state_path(site, out_dir=".sessions")
# Check for alternative session file name
if not session_path:
alt_paths = [
os.path.join(os.getcwd(), "src", "utils", ".sessions", "tw_state.json"),
os.path.join(os.getcwd(), ".sessions", "tw_state.json"),
os.path.join(os.getcwd(), "tw_state.json"),
]
for path in alt_paths:
if os.path.exists(path):
session_path = path
logger.info(f"[TWITTER] Found session at {path}")
break
if not session_path:
return json.dumps(
{
"error": "No Twitter session found",
"solution": "Run the Twitter session manager to create a session",
},
default=str,
)
results = []
try:
with sync_playwright() as p:
browser = p.chromium.launch(
headless=True,
args=[
"--disable-blink-features=AutomationControlled",
"--no-sandbox",
"--disable-dev-shm-usage",
],
)
context = browser.new_context(
storage_state=session_path,
viewport={"width": 1280, "height": 720},
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
)
context.add_init_script(
"""
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
window.chrome = {runtime: {}};
"""
)
page = context.new_page()
# Try different search URLs
search_urls = [
f"https://x.com/search?q={quote_plus(query)}&src=typed_query&f=live",
f"https://x.com/search?q={quote_plus(query)}&src=typed_query",
f"https://x.com/search?q={quote_plus(query)}",
]
success = False
for url in search_urls:
try:
logger.info(f"[TWITTER] Trying {url}")
page.goto(url, timeout=60000, wait_until="domcontentloaded")
time.sleep(5)
# Handle popups
popup_selectors = [
"[data-testid='app-bar-close']",
"[aria-label='Close']",
"button:has-text('Not now')",
]
for selector in popup_selectors:
try:
if (
page.locator(selector).count() > 0
and page.locator(selector).first.is_visible()
):
page.locator(selector).first.click()
time.sleep(1)
except:
pass
# Wait for tweets
try:
page.wait_for_selector(
"article[data-testid='tweet']", timeout=15000
)
logger.info("[TWITTER] Tweets found!")
success = True
break
except:
logger.warning("[TWITTER] No tweets found, trying next URL...")
continue
except Exception as e:
logger.error(f"[TWITTER] Navigation failed: {e}")
continue
if not success or "login" in page.url:
logger.error("[TWITTER] Could not load tweets or session expired")
return json.dumps(
{"error": "Session invalid or tweets not found"}, default=str
)
# Scraping
seen = set()
scroll_attempts = 0
max_scroll_attempts = 15
TWEET_SELECTOR = "article[data-testid='tweet']"
TEXT_SELECTOR = "div[data-testid='tweetText']"
USER_SELECTOR = "div[data-testid='User-Name']"
while len(results) < max_items and scroll_attempts < max_scroll_attempts:
scroll_attempts += 1
# Expand "Show more" buttons
try:
show_more_buttons = page.locator(
"[data-testid='tweet-text-show-more-link']"
).all()
for button in show_more_buttons:
if button.is_visible():
try:
button.click()
time.sleep(0.3)
except:
pass
except:
pass
# Collect tweets
tweets = page.locator(TWEET_SELECTOR).all()
new_tweets_found = 0
for tweet in tweets:
if len(results) >= max_items:
break
try:
tweet.scroll_into_view_if_needed()
time.sleep(0.1)
# Skip promoted tweets
if (
tweet.locator("span:has-text('Promoted')").count() > 0
or tweet.locator("span:has-text('Ad')").count() > 0
):
continue
# Extract text
text_content = ""
text_element = tweet.locator(TEXT_SELECTOR).first
if text_element.count() > 0:
text_content = text_element.inner_text()
cleaned_text = clean_twitter_text(text_content)
# Extract user
user_info = "Unknown"
user_element = tweet.locator(USER_SELECTOR).first
if user_element.count() > 0:
user_text = user_element.inner_text()
user_info = user_text.split("\n")[0].strip()
# Extract timestamp
timestamp = extract_twitter_timestamp(tweet)
# Deduplication
text_key = cleaned_text[:50] if cleaned_text else ""
unique_key = f"{user_info}_{text_key}"
if (
cleaned_text
and len(cleaned_text) > 20
and unique_key not in seen
and not any(
word in cleaned_text.lower()
for word in ["promoted", "advertisement"]
)
):
seen.add(unique_key)
results.append(
{
"source": "Twitter",
"poster": user_info,
"text": cleaned_text,
"timestamp": timestamp,
"url": "https://x.com",
}
)
new_tweets_found += 1
logger.info(
f"[TWITTER] Collected tweet {len(results)}/{max_items}"
)
except Exception:
continue
# Scroll down
if len(results) < max_items:
page.evaluate(
"window.scrollTo(0, document.documentElement.scrollHeight)"
)
time.sleep(random.uniform(2, 3))
if new_tweets_found == 0:
scroll_attempts += 1
else:
scroll_attempts = 0
browser.close()
return json.dumps(
{
"source": "Twitter",
"query": query,
"results": results,
"total_found": len(results),
"fetched_at": utc_now().isoformat(),
},
default=str,
indent=2,
)
except Exception as e:
logger.error(f"[TWITTER] {e}")
return json.dumps({"error": str(e)}, default=str)
# """
# Twitter trending/search wrapper. For trending, call scrape_twitter_trending_srilanka().
# For search, this will attempt Playwright fetch if available, else Nitter fallback.
# """
# try:
# if query.strip().lower() in ("trending", "trends", "trending srilanka", "trending sri lanka"):
# return json.dumps(scrape_twitter_trending_srilanka(use_playwright=use_playwright, storage_state_site=storage_state_site), default=str)
# if use_playwright and PLAYWRIGHT_AVAILABLE:
# storage_state = None
# if storage_state_site:
# storage_state = load_playwright_storage_state_path(storage_state_site)
# search_url = f"https://twitter.com/search?q={quote_plus(query)}&src=typed_query"
# try:
# html = playwright_fetch_html_using_session(search_url, storage_state or "", headless=True)
# if html:
# items = _simple_parse_posts_from_html(html, "https://twitter.com", max_items=20)
# return json.dumps({"source": "twitter_playwright", "results": items}, default=str)
# except Exception as e:
# logger.debug(f"[TWITTER] Playwright search failed: {e}")
# nitter = "https://nitter.net"
# search_url = f"{nitter}/search?f=tweets&q={quote_plus(query)}"
# resp = _safe_get(search_url)
# if not resp:
# return json.dumps({"error": "Could not fetch Twitter via Playwright or Nitter fallback"})
# soup = BeautifulSoup(resp.text, "html.parser")
# items = []
# for a in soup.select("div.timeline-item"):
# t = a.get_text(separator=" ", strip=True)
# link = a.find("a", href=True)
# href = _make_absolute(link["href"], nitter) if link else None
# items.append({"text": t[:400], "url": href})
# return json.dumps({"source": "nitter", "results": items[:20]}, default=str)
# except Exception as e:
# return json.dumps({"error": str(e)})
# =====================================================
# FACEBOOK & INSTAGRAM UTILITY FUNCTIONS
# =====================================================
def clean_fb_text(text):
"""Clean Facebook noisy text"""
if not text:
return ""
text = re.sub(r"\b(?:[a-zA-Z]\s+){4,}\b", "", text)
text = re.sub(r"(Facebook\s*){2,}", "", text)
text = re.sub(r"Like\s*Comment\s*Share", "", text)
text = re.sub(r"All reactions:\s*\d+\s*", "", text)
text = re.sub(r"\n\d+\n", "\n", text)
text = "\n".join([line.strip() for line in text.splitlines() if line.strip()])
return text.strip()
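# Worked example (pure string processing):
# >>> clean_fb_text("Road closed in Kandy\nAll reactions: 57\nLike Comment Share")
# 'Road closed in Kandy'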
def extract_media_id_instagram(page):
"""Extract Instagram media ID"""
html = page.content()
match = re.search(r'"media_id":"(\d+)"', html)
if match:
return match.group(1)
match = re.search(r'"id":"(\d+_\d+)"', html)
if match:
return match.group(1)
return None
def fetch_caption_via_private_api(page, media_id):
"""Instagram Private API Caption fetch"""
if not media_id:
return None
api_url = f"https://i.instagram.com/api/v1/media/{media_id}/info/"
try:
response = page.request.get(
api_url,
headers={
"User-Agent": (
"Instagram 290.0.0.0.66 (iPhone14,5; iOS 17_0; en_US) "
"AppleWebKit/605.1.15"
),
"X-IG-App-ID": "936619743392459",
},
timeout=20000,
)
if response.status != 200:
return None
data = response.json()
if "items" in data and data["items"]:
return data["items"][0].get("caption", {}).get("text")
except:
pass
return None
@tool
def scrape_instagram(keywords: Optional[List[str]] = None, max_items: int = 15):
"""
Instagram scraper using Playwright session.
Scrapes posts from hashtag search and extracts captions.
"""
ensure_playwright()
# Load Session
site = "instagram"
session_path = load_playwright_storage_state_path(
site, out_dir="src/utils/.sessions"
)
if not session_path:
session_path = load_playwright_storage_state_path(site, out_dir=".sessions")
# Check for alternative session file name
if not session_path:
alt_paths = [
os.path.join(os.getcwd(), "src", "utils", ".sessions", "ig_state.json"),
os.path.join(os.getcwd(), ".sessions", "ig_state.json"),
os.path.join(os.getcwd(), "ig_state.json"),
]
for path in alt_paths:
if os.path.exists(path):
session_path = path
logger.info(f"[INSTAGRAM] Found session at {path}")
break
if not session_path:
return json.dumps(
{
"error": "No Instagram session found",
"solution": "Run the Instagram session manager to create a session",
},
default=str,
)
keyword = " ".join(keywords) if keywords else "srilanka"
keyword = keyword.replace(" ", "") # Instagram hashtags don't have spaces
results = []
try:
with sync_playwright() as p:
instagram_mobile_ua = (
"Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1"
)
browser = p.chromium.launch(headless=True)
context = browser.new_context(
storage_state=session_path,
user_agent=instagram_mobile_ua,
viewport={"width": 430, "height": 932},
)
page = context.new_page()
url = f"https://www.instagram.com/explore/tags/{keyword}/"
logger.info(f"[INSTAGRAM] Navigating to {url}")
page.goto(url, timeout=120000)
page.wait_for_timeout(4000)
# Scroll to load posts
for _ in range(12):
page.mouse.wheel(0, 2500)
page.wait_for_timeout(1500)
# Collect post links
anchors = page.locator("a[href*='/p/'], a[href*='/reel/']").all()
links = []
for a in anchors:
href = a.get_attribute("href")
if href:
full = "https://www.instagram.com" + href
links.append(full)
if len(links) >= max_items:
break
logger.info(f"[INSTAGRAM] Found {len(links)} posts")
# Extract captions from each post
for link in links:
logger.info(f"[INSTAGRAM] Scraping {link}")
page.goto(link, timeout=120000)
page.wait_for_timeout(2000)
media_id = extract_media_id_instagram(page)
caption = fetch_caption_via_private_api(page, media_id)
# Fallback to direct extraction
if not caption:
try:
caption = (
page.locator("article h1, article span")
.first.inner_text()
.strip()
)
except:
caption = None
if caption:
results.append(
{
"source": "Instagram",
"text": caption,
"url": link,
"poster": "(Instagram User)",
}
)
logger.info(
f"[INSTAGRAM] Collected caption {len(results)}/{max_items}"
)
browser.close()
return json.dumps(
{
"site": "Instagram",
"results": results,
"storage_state": session_path,
},
default=str,
)
except Exception as e:
logger.error(f"[INSTAGRAM] {e}")
return json.dumps({"error": str(e)}, default=str)
@tool
def scrape_facebook(keywords: Optional[List[str]] = None, max_items: int = 10):
"""
Facebook scraper using Playwright session (Desktop).
Extracts posts from keyword search with poster names and text.
"""
ensure_playwright()
# Load Session
site = "facebook"
session_path = load_playwright_storage_state_path(
site, out_dir="src/utils/.sessions"
)
if not session_path:
session_path = load_playwright_storage_state_path(site, out_dir=".sessions")
# Check for alternative session file name
if not session_path:
alt_paths = [
os.path.join(os.getcwd(), "src", "utils", ".sessions", "fb_state.json"),
os.path.join(os.getcwd(), ".sessions", "fb_state.json"),
os.path.join(os.getcwd(), "fb_state.json"),
]
for path in alt_paths:
if os.path.exists(path):
session_path = path
logger.info(f"[FACEBOOK] Found session at {path}")
break
if not session_path:
return json.dumps(
{
"error": "No Facebook session found",
"solution": "Run the Facebook session manager to create a session",
},
default=str,
)
keyword = " ".join(keywords) if keywords else "Sri Lanka"
results = []
try:
with sync_playwright() as p:
facebook_desktop_ua = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
browser = p.chromium.launch(headless=True)
context = browser.new_context(
storage_state=session_path,
user_agent=facebook_desktop_ua,
viewport={"width": 1400, "height": 900},
)
page = context.new_page()
search_url = f"https://www.facebook.com/search/posts?q={quote(keyword)}"
logger.info(f"[FACEBOOK] Navigating to {search_url}")
page.goto(search_url, timeout=120000)
time.sleep(5)
seen = set()
stuck = 0
last_scroll = 0
MESSAGE_SELECTOR = "div[data-ad-preview='message']"
# Poster selectors
POSTER_SELECTORS = [
"h3 strong a span",
"h3 strong span",
"h3 a span",
"strong a span",
"a[role='link'] span:not([class*='timestamp'])",
"span.fwb a",
"span.fwb",
"a[aria-hidden='false'] span",
"a[role='link'] span",
]
def extract_poster(post):
"""Extract poster name from Facebook post"""
parent = post.locator(
"xpath=ancestor::div[contains(@class, 'x1yztbdb')][1]"
)
for selector in POSTER_SELECTORS:
try:
el = parent.locator(selector).first
if el and el.count() > 0:
name = el.inner_text().strip()
if name and name != "Facebook" and len(name) > 1:
return name
except:
pass
return "(Unknown)"
# IMPROVED: Expand ALL "See more" buttons on page before extracting
def expand_all_see_more():
"""Click all 'See more' buttons on the visible page"""
see_more_selectors = [
# Primary Facebook "See more" patterns
"div[role='button'] span:text-is('See more')",
"div[role='button']:has-text('See more')",
"span:text-is('See more')",
"span:text-is('... See more')",
"span:text-is('...See more')",
# Alternate patterns
"[role='button']:has-text('See more')",
"div.x1i10hfl:has-text('See more')",
# Direct text match
"text='See more'",
"text='... See more'",
]
clicked = 0
for selector in see_more_selectors:
try:
buttons = page.locator(selector).all()
for btn in buttons:
try:
if btn.is_visible():
btn.scroll_into_view_if_needed()
time.sleep(0.2)
btn.click(force=True)
clicked += 1
time.sleep(0.3)
except:
pass
except:
pass
if clicked > 0:
logger.info(f"[FACEBOOK] Expanded {clicked} 'See more' buttons")
return clicked
while len(results) < max_items:
# First expand all "See more" on visible content
expand_all_see_more()
time.sleep(0.5)
posts = page.locator(MESSAGE_SELECTOR).all()
for post in posts:
try:
# Try to expand within this specific post container too
try:
post.scroll_into_view_if_needed()
time.sleep(0.3)
# Look for See more in parent container
parent = post.locator(
"xpath=ancestor::div[contains(@class, 'x1yztbdb')][1]"
)
post_see_more_selectors = [
"div[role='button'] span:text-is('See more')",
"span:text-is('See more')",
"div[role='button']:has-text('See more')",
]
for selector in post_see_more_selectors:
try:
btns = parent.locator(selector)
if btns.count() > 0 and btns.first.is_visible():
btns.first.click(force=True)
time.sleep(0.5)
break
except:
pass
except:
pass
raw = post.inner_text().strip()
cleaned = clean_fb_text(raw)
poster = extract_poster(post)
if cleaned and len(cleaned) > 30:
key = poster + "::" + cleaned
if key not in seen:
seen.add(key)
results.append(
{
"source": "Facebook",
"poster": poster,
"text": cleaned,
"url": "https://www.facebook.com",
}
)
logger.info(
f"[FACEBOOK] Collected post {len(results)}/{max_items}"
)
if len(results) >= max_items:
break
except:
pass
# Scroll
page.evaluate("window.scrollBy(0, 2300)")
time.sleep(1.2)
new_scroll = page.evaluate("window.scrollY")
stuck = stuck + 1 if new_scroll == last_scroll else 0
last_scroll = new_scroll
if stuck >= 3:
logger.info("[FACEBOOK] Reached end of results")
break
browser.close()
return json.dumps(
{
"site": "Facebook",
"results": results[:max_items],
"storage_state": session_path,
},
default=str,
)
except Exception as e:
logger.error(f"[FACEBOOK] {e}")
return json.dumps({"error": str(e)}, default=str)
@tool
def scrape_government_gazette(
keywords: Optional[List[str]] = None, max_items: int = 15
):
"""
Search and scrape Sri Lankan government gazette entries from gazette.lk.
This tool visits each gazette page to extract full descriptions and download links (PDFs).
"""
data = scrape_government_gazette_impl(keywords=keywords, max_items=max_items)
return json.dumps(data, default=str)
@tool
def scrape_parliament_minutes(
keywords: Optional[List[str]] = None, max_items: int = 20
):
"""
Search and scrape Sri Lankan Parliament Hansards and minutes matching keywords.
"""
data = scrape_parliament_minutes_impl(keywords=keywords, max_items=max_items)
return json.dumps(data, default=str)
@tool
def scrape_train_schedule(
from_station: Optional[str] = None,
to_station: Optional[str] = None,
keyword: Optional[str] = None,
max_items: int = 30,
):
"""
Scrape Sri Lanka Railways train schedule based on stations or keywords.
"""
data = scrape_train_schedule_impl(
from_station=from_station,
to_station=to_station,
keyword=keyword,
max_items=max_items,
)
return json.dumps(data, default=str)
@tool
def scrape_cse_stock_data(
symbol: str = "ASPI", period: str = "1d", interval: str = "1h"
):
"""
Scrape Colombo Stock Exchange (CSE) data for a given symbol (e.g., ASPI).
Scrapes the CSE website directly first, then falls back to yfinance (which has limited CSE coverage).
"""
data = scrape_cse_stock_impl(symbol=symbol, period=period, interval=interval)
return json.dumps(data, default=str)
@tool
def scrape_local_news(keywords: Optional[List[str]] = None, max_articles: int = 30):
"""
Scrape major Sri Lankan local news websites (Daily Mirror, Daily FT, etc.) for articles matching keywords.
"""
data = scrape_local_news_impl(keywords=keywords, max_articles=max_articles)
return json.dumps(data, default=str)
@tool
def think_tool(reflection: str) -> str:
"""
Log a thought or reflection from the agent. Useful for debugging or tracing the agent's reasoning.
"""
return f"Reflection recorded: {reflection}"
# =====================================================
# FACEBOOK & INSTAGRAM UTILITY FUNCTIONS
# =====================================================
def clean_fb_text(text):
"""Clean Facebook noisy text"""
if not text:
return ""
text = re.sub(r"\b(?:[a-zA-Z]\s+){4,}\b", "", text)
text = re.sub(r"(Facebook\s*){2,}", "", text)
text = re.sub(r"Like\s*Comment\s*Share", "", text)
text = re.sub(r"All reactions:\s*\d+\s*", "", text)
text = re.sub(r"\n\d+\n", "\n", text)
text = "\n".join([line.strip() for line in text.splitlines() if line.strip()])
return text.strip()
def extract_media_id_instagram(page):
"""Extract Instagram media ID"""
html = page.content()
match = re.search(r'"media_id":"(\d+)"', html)
if match:
return match.group(1)
match = re.search(r'"id":"(\d+_\d+)"', html)
if match:
return match.group(1)
return None
def fetch_caption_via_private_api(page, media_id):
"""Instagram Private API Caption fetch"""
if not media_id:
return None
api_url = f"https://i.instagram.com/api/v1/media/{media_id}/info/"
try:
response = page.request.get(
api_url,
headers={
"User-Agent": (
"Instagram 290.0.0.0.66 (iPhone14,5; iOS 17_0; en_US) "
"AppleWebKit/605.1.15"
),
"X-IG-App-ID": "936619743392459",
},
timeout=20000,
)
if response.status != 200:
return None
data = response.json()
if "items" in data and data["items"]:
return data["items"][0].get("caption", {}).get("text")
    except Exception:
pass
return None
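# The private-API helper above assumes a JSON payload shaped roughly like
#   {"items": [{"caption": {"text": "Post caption here"}, ...}]}
# (illustrative, trimmed). A missing "items" key, an empty list, or a caption-less post
# all fall through to the None return, which triggers the DOM-based fallback in the scraper.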
@tool
def scrape_instagram(keywords: Optional[List[str]] = None, max_items: int = 15):
"""
Instagram scraper using Playwright session.
Scrapes posts from hashtag search and extracts captions.
"""
ensure_playwright()
# Load Session
site = "instagram"
session_path = load_playwright_storage_state_path(
site, out_dir="src/utils/.sessions"
)
if not session_path:
session_path = load_playwright_storage_state_path(site, out_dir=".sessions")
# Check for alternative session file name
if not session_path:
alt_paths = [
os.path.join(os.getcwd(), "src", "utils", ".sessions", "ig_state.json"),
os.path.join(os.getcwd(), ".sessions", "ig_state.json"),
os.path.join(os.getcwd(), "ig_state.json"),
]
for path in alt_paths:
if os.path.exists(path):
session_path = path
logger.info(f"[INSTAGRAM] Found session at {path}")
break
if not session_path:
return json.dumps(
{
"error": "No Instagram session found",
"solution": "Run the Instagram session manager to create a session",
},
default=str,
)
keyword = " ".join(keywords) if keywords else "srilanka"
keyword = keyword.replace(" ", "") # Instagram hashtags don't have spaces
results = []
try:
with sync_playwright() as p:
instagram_mobile_ua = (
"Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1"
)
browser = p.chromium.launch(headless=True)
context = browser.new_context(
storage_state=session_path,
user_agent=instagram_mobile_ua,
viewport={"width": 430, "height": 932},
)
page = context.new_page()
url = f"https://www.instagram.com/explore/tags/{keyword}/"
logger.info(f"[INSTAGRAM] Navigating to {url}")
page.goto(url, timeout=120000)
page.wait_for_timeout(4000)
# Scroll to load posts
for _ in range(12):
page.mouse.wheel(0, 2500)
page.wait_for_timeout(1500)
# Collect post links
anchors = page.locator("a[href*='/p/'], a[href*='/reel/']").all()
links = []
            for a in anchors:
                href = a.get_attribute("href")
                if href:
                    full = urljoin("https://www.instagram.com", href)
                    # Skip duplicate overlay anchors that point at the same post
                    if full not in links:
                        links.append(full)
                    if len(links) >= max_items:
                        break
logger.info(f"[INSTAGRAM] Found {len(links)} posts")
# Extract captions from each post
for link in links:
logger.info(f"[INSTAGRAM] Scraping {link}")
page.goto(link, timeout=120000)
page.wait_for_timeout(2000)
media_id = extract_media_id_instagram(page)
caption = fetch_caption_via_private_api(page, media_id)
# Fallback to direct extraction
if not caption:
try:
caption = (
page.locator("article h1, article span")
.first.inner_text()
.strip()
)
                    except Exception:
caption = None
if caption:
results.append(
{
"source": "Instagram",
"text": caption,
"url": link,
"poster": "(Instagram User)",
}
)
logger.info(
f"[INSTAGRAM] Collected caption {len(results)}/{max_items}"
)
browser.close()
return json.dumps(
{
"site": "Instagram",
"results": results,
"storage_state": session_path,
},
default=str,
)
except Exception as e:
logger.error(f"[INSTAGRAM] {e}")
return json.dumps({"error": str(e)}, default=str)
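# Usage sketch for the Instagram tool (requires a saved Playwright session such as
# ig_state.json; the hashtag is illustrative):
#   raw = scrape_instagram.invoke({"keywords": ["colombo"], "max_items": 5})
#   captions = json.loads(raw).get("results", [])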
@tool
def scrape_facebook(keywords: Optional[List[str]] = None, max_items: int = 10):
"""
Facebook scraper using Playwright session (Desktop).
Extracts posts from keyword search with poster names and text.
"""
ensure_playwright()
# Load Session
site = "facebook"
session_path = load_playwright_storage_state_path(
site, out_dir="src/utils/.sessions"
)
if not session_path:
session_path = load_playwright_storage_state_path(site, out_dir=".sessions")
# Check for alternative session file name
if not session_path:
alt_paths = [
os.path.join(os.getcwd(), "src", "utils", ".sessions", "fb_state.json"),
os.path.join(os.getcwd(), ".sessions", "fb_state.json"),
os.path.join(os.getcwd(), "fb_state.json"),
]
for path in alt_paths:
if os.path.exists(path):
session_path = path
logger.info(f"[FACEBOOK] Found session at {path}")
break
if not session_path:
return json.dumps(
{
"error": "No Facebook session found",
"solution": "Run the Facebook session manager to create a session",
},
default=str,
)
keyword = " ".join(keywords) if keywords else "Sri Lanka"
results = []
try:
with sync_playwright() as p:
facebook_desktop_ua = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
browser = p.chromium.launch(headless=True)
context = browser.new_context(
storage_state=session_path,
user_agent=facebook_desktop_ua,
viewport={"width": 1400, "height": 900},
)
page = context.new_page()
            # URL-encode the full query so special characters survive, not just spaces
            search_url = (
                f"https://www.facebook.com/search/posts?q={quote_plus(keyword)}"
            )
logger.info(f"[FACEBOOK] Navigating to {search_url}")
page.goto(search_url, timeout=120000)
time.sleep(5)
seen = set()
stuck = 0
last_scroll = 0
MESSAGE_SELECTOR = "div[data-ad-preview='message']"
# Poster selectors
POSTER_SELECTORS = [
"h3 strong a span",
"h3 strong span",
"h3 a span",
"strong a span",
"a[role='link'] span:not([class*='timestamp'])",
"span.fwb a",
"span.fwb",
"a[aria-hidden='false'] span",
"a[role='link'] span",
]
def extract_poster(post):
"""Extract poster name from Facebook post"""
parent = post.locator(
"xpath=ancestor::div[contains(@class, 'x1yztbdb')][1]"
)
for selector in POSTER_SELECTORS:
try:
el = parent.locator(selector).first
if el and el.count() > 0:
name = el.inner_text().strip()
if name and name != "Facebook" and len(name) > 1:
return name
                    except Exception:
pass
return "(Unknown)"
while len(results) < max_items:
posts = page.locator(MESSAGE_SELECTOR).all()
for post in posts:
try:
raw = post.inner_text().strip()
cleaned = clean_fb_text(raw)
poster = extract_poster(post)
if cleaned and len(cleaned) > 30:
key = poster + "::" + cleaned
if key not in seen:
seen.add(key)
results.append(
{
"source": "Facebook",
"poster": poster,
"text": cleaned,
"url": "https://www.facebook.com",
}
)
logger.info(
f"[FACEBOOK] Collected post {len(results)}/{max_items}"
)
if len(results) >= max_items:
break
                        except Exception:
pass
# Scroll
page.evaluate("window.scrollBy(0, 2300)")
time.sleep(1.2)
new_scroll = page.evaluate("window.scrollY")
stuck = stuck + 1 if new_scroll == last_scroll else 0
last_scroll = new_scroll
if stuck >= 3:
logger.info("[FACEBOOK] Reached end of results")
break
browser.close()
return json.dumps(
{
"site": "Facebook",
"results": results[:max_items],
"storage_state": session_path,
},
default=str,
)
except Exception as e:
logger.error(f"[FACEBOOK] {e}")
return json.dumps({"error": str(e)}, default=str)
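# Usage sketch for the Facebook tool (requires a saved fb_state.json session; the query
# is illustrative):
#   raw = scrape_facebook.invoke({"keywords": ["fuel price"], "max_items": 5})
#   for post in json.loads(raw).get("results", []):
#       print(post["poster"], "->", post["text"][:80])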
@tool
def scrape_reddit(
keywords: List[str], limit: int = 20, subreddit: Optional[str] = None
):
"""
Scrape Reddit for posts matching specific keywords.
Optionally restrict to a specific subreddit.
"""
data = scrape_reddit_impl(keywords=keywords, limit=limit, subreddit=subreddit)
return json.dumps(data, default=str)
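# Usage sketch for the Reddit tool (the subreddit is illustrative):
#   scrape_reddit.invoke({"keywords": ["flood"], "limit": 10, "subreddit": "srilanka"})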
# ============================================
# TOOL REGISTRY & EXPORTS
# ============================================
TOOL_MAPPING = {
"scrape_linkedin": scrape_linkedin,
"scrape_instagram": scrape_instagram,
"scrape_facebook": scrape_facebook,
"scrape_reddit": scrape_reddit,
"scrape_twitter": scrape_twitter,
"scrape_government_gazette": scrape_government_gazette,
"scrape_parliament_minutes": scrape_parliament_minutes,
"scrape_train_schedule": scrape_train_schedule,
"scrape_cse_stock_data": scrape_cse_stock_data,
"scrape_local_news": scrape_local_news,
"think_tool": think_tool,
}
# Import and add profile scrapers for competitive intelligence
try:
from src.utils.profile_scrapers import (
scrape_twitter_profile,
scrape_facebook_profile,
scrape_instagram_profile,
scrape_linkedin_profile,
scrape_product_reviews,
)
TOOL_MAPPING["scrape_twitter_profile"] = scrape_twitter_profile
TOOL_MAPPING["scrape_facebook_profile"] = scrape_facebook_profile
TOOL_MAPPING["scrape_instagram_profile"] = scrape_instagram_profile
TOOL_MAPPING["scrape_linkedin_profile"] = scrape_linkedin_profile
TOOL_MAPPING["scrape_product_reviews"] = scrape_product_reviews
print("[OK] Profile scrapers loaded for Intelligence Agent")
except ImportError as e:
print(f"[WARN] Profile scrapers not available: {e}")
ALL_TOOLS = list(TOOL_MAPPING.values())
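# Sketch of wiring the registry into an agent (the chat-model class and model name are
# assumptions, not part of this module):
#   from langchain_openai import ChatOpenAI
#   llm_with_tools = ChatOpenAI(model="gpt-4o-mini").bind_tools(ALL_TOOLS)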
__all__ = [
"get_today_str",
"tool_dmc_alerts",
"tool_weather_nowcast",
"TOOL_MAPPING",
"ALL_TOOLS",
"create_or_restore_playwright_session",
"playwright_fetch_html_using_session",
]
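if __name__ == "__main__":
    # Minimal smoke check for local debugging: list the registered tools
    # without making any network calls.
    for _name in sorted(TOOL_MAPPING):
        print(f"[TOOL] {_name}")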