File size: 10,700 Bytes
b4856f1
 
 
 
 
 
 
 
752f5cc
b4856f1
 
 
 
 
752f5cc
 
 
 
b4856f1
 
 
 
 
 
 
 
 
 
 
752f5cc
b4856f1
 
 
 
752f5cc
b4856f1
 
 
 
 
 
 
 
 
752f5cc
b4856f1
 
 
752f5cc
b4856f1
 
 
 
752f5cc
b4856f1
 
 
752f5cc
b4856f1
752f5cc
b4856f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
752f5cc
b4856f1
752f5cc
b4856f1
752f5cc
 
 
 
 
 
 
 
 
b4856f1
 
752f5cc
b4856f1
 
752f5cc
b4856f1
 
 
 
 
 
 
752f5cc
b4856f1
 
752f5cc
b4856f1
 
 
 
 
 
752f5cc
b4856f1
 
 
 
752f5cc
b4856f1
 
 
 
752f5cc
b4856f1
 
 
 
752f5cc
b4856f1
 
752f5cc
b4856f1
752f5cc
b4856f1
 
752f5cc
b4856f1
 
 
 
 
752f5cc
b4856f1
 
 
752f5cc
b4856f1
 
 
 
 
 
752f5cc
b4856f1
 
 
752f5cc
b4856f1
752f5cc
 
b4856f1
 
 
 
752f5cc
b4856f1
 
 
752f5cc
b4856f1
 
 
 
 
 
 
752f5cc
b4856f1
752f5cc
b4856f1
752f5cc
b4856f1
 
 
 
 
 
 
 
16ec2cf
b4856f1
 
 
 
752f5cc
b4856f1
752f5cc
b4856f1
752f5cc
 
b4856f1
 
 
 
752f5cc
b4856f1
 
 
752f5cc
b4856f1
 
 
 
 
 
752f5cc
b4856f1
752f5cc
b4856f1
752f5cc
 
 
 
 
 
b4856f1
752f5cc
16ec2cf
b4856f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16ec2cf
b4856f1
 
16ec2cf
b4856f1
752f5cc
b4856f1
752f5cc
b4856f1
 
752f5cc
 
 
 
 
 
 
 
 
b4856f1
 
 
752f5cc
b4856f1
 
752f5cc
b4856f1
 
752f5cc
b4856f1
 
 
 
 
 
 
 
 
 
 
 
752f5cc
 
 
 
 
b4856f1
 
 
 
752f5cc
b4856f1
 
752f5cc
b4856f1
752f5cc
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
"""
src/nodes/dataRetrievalAgentNode.py
COMPLETE - Data Retrieval Agent Node Implementation
Handles orchestrator-worker pattern for scraping tasks

Updated: Uses Tool Factory pattern for parallel execution safety.
Each agent instance gets its own private set of tools.
"""

import json
import uuid
from typing import List
from langchain_core.messages import HumanMessage, SystemMessage
from src.states.dataRetrievalAgentState import (
    DataRetrievalAgentState,
    ScrapingTask,
    RawScrapedData,
    ClassifiedEvent,
)
from src.utils.tool_factory import create_tool_set


class DataRetrievalAgentNode:
    """
    Implements the Data Retrieval Agent workflow:
    1. Master Agent - Plans scraping tasks
    2. Worker Agent - Executes individual tasks
    3. Tool Node - Runs the actual tools
    4. Classifier Agent - Categorizes results for domain agents

    Thread Safety:
        Each DataRetrievalAgentNode instance creates its own private ToolSet,
        enabling safe parallel execution with other agents.
    """

    def __init__(self, llm):
        """Initialize with LLM and private tool set.

        Args:
            llm: LangChain-compatible chat model exposing ``invoke()``.
        """
        # Create PRIVATE tool instances for this agent so parallel agent
        # instances never share mutable tool state.
        self.tools = create_tool_set()
        self.llm = llm

    # =========================================================================
    # SHARED HELPERS
    # =========================================================================

    @staticmethod
    def _extract_json(raw: str):
        """Parse JSON from an LLM response, tolerating markdown code fences.

        Chat models frequently wrap JSON in ```json ... ``` fences even when
        instructed to return raw JSON; a bare ``json.loads`` would raise on
        that and force the caller into its fallback path. This strips one
        leading fence line and one trailing fence, if present, then parses.

        Args:
            raw: The model's text output.

        Returns:
            The decoded JSON value (list, dict, etc.).

        Raises:
            json.JSONDecodeError: If the de-fenced text is still not valid JSON.
        """
        text = raw.strip()
        if text.startswith("```"):
            # Drop the opening fence line (e.g. "```json" or "```").
            text = text.split("\n", 1)[1] if "\n" in text else ""
            stripped = text.rstrip()
            if stripped.endswith("```"):
                text = stripped[:-3]
        return json.loads(text)

    # =========================================================================
    # 1. MASTER AGENT (TASK DELEGATOR)
    # =========================================================================

    def master_agent_node(self, state: DataRetrievalAgentState):
        """
        TASK DELEGATOR MASTER AGENT

        Decides which scraping tools to run based on:
        - Previously completed tasks (avoid redundancy)
        - Current monitoring needs
        - Keywords of interest

        Args:
            state: Current agent state; reads ``worker_results`` and
                ``previous_tasks``.

        Returns:
            Partial state update with ``generated_tasks`` (List[ScrapingTask])
            and ``previous_tasks`` (tool names just planned).
        """
        print("=== [MASTER AGENT] Planning Scraping Tasks ===")

        completed_tools = [r.source_tool for r in state.worker_results]

        system_prompt = f"""
You are the Master Data Retrieval Agent for Roger - Sri Lanka's situational awareness platform.

AVAILABLE TOOLS: {list(self.tools.as_dict().keys())}

Your job:
1. Decide which tools to run to keep the system updated
2. Avoid re-running tools just executed: {completed_tools}
3. Prioritize a mix of:
   - Official sources: scrape_government_gazette, scrape_parliament_minutes, scrape_train_schedule
   - Market data: scrape_cse_stock_data, scrape_local_news
   - Social media: scrape_reddit, scrape_twitter, scrape_facebook

Focus on Sri Lankan context with keywords like:
- "election", "policy", "budget", "strike", "inflation"
- "fuel", "railway", "protest", "flood", "gazette"

Previously planned: {state.previous_tasks}

Respond with valid JSON array:
[
  {{
    "tool_name": "<tool_name>",
    "parameters": {{"keywords": [...]}},
    "priority": "high" | "normal"
  }},
  ...
]

If no tasks needed, return []
"""

        parsed_tasks: List[ScrapingTask] = []

        try:
            response = self.llm.invoke(
                [
                    SystemMessage(content=system_prompt),
                    HumanMessage(
                        content="Plan the next scraping wave for Sri Lankan situational awareness."
                    ),
                ]
            )

            raw = response.content
            # Tolerate ```json fences that models often add despite instructions.
            suggested = self._extract_json(raw)

            # A single task object is accepted and normalized to a list.
            if isinstance(suggested, dict):
                suggested = [suggested]

            for item in suggested:
                try:
                    task = ScrapingTask(**item)
                    parsed_tasks.append(task)
                except Exception as e:
                    # Skip malformed entries rather than aborting the whole plan.
                    print(f"[MASTER] Failed to parse task: {e}")
                    continue

        except Exception as e:
            print(f"[MASTER] LLM planning failed: {e}, using fallback plan")

        # Fallback plan if LLM fails — only on the very first wave, so a
        # deliberate empty plan ([]) on later waves is respected.
        if not parsed_tasks and not state.previous_tasks:
            parsed_tasks = [
                ScrapingTask(
                    tool_name="scrape_local_news",
                    parameters={"keywords": ["Sri Lanka", "economy", "politics"]},
                    priority="high",
                ),
                ScrapingTask(
                    tool_name="scrape_cse_stock_data",
                    parameters={"symbol": "ASPI"},
                    priority="high",
                ),
                ScrapingTask(
                    tool_name="scrape_government_gazette",
                    parameters={"keywords": ["tax", "import", "regulation"]},
                    priority="normal",
                ),
                ScrapingTask(
                    tool_name="scrape_reddit",
                    parameters={"keywords": ["Sri Lanka"], "limit": 20},
                    priority="normal",
                ),
            ]

        print(f"[MASTER] Planned {len(parsed_tasks)} tasks")

        return {
            "generated_tasks": parsed_tasks,
            "previous_tasks": [t.tool_name for t in parsed_tasks],
        }

    # =========================================================================
    # 2. WORKER AGENT
    # =========================================================================

    def worker_agent_node(self, state: DataRetrievalAgentState):
        """
        DATA RETRIEVAL WORKER AGENT

        Pops next task from queue and prepares it for ToolNode execution.
        This runs in parallel via map() in the graph.

        Returns:
            ``{}`` when the queue is empty; otherwise a partial state update
            with the dequeued ``current_task`` and remaining ``generated_tasks``.
        """
        if not state.generated_tasks:
            print("[WORKER] No tasks in queue")
            return {}

        # Pop first task (FIFO)
        current_task = state.generated_tasks[0]
        remaining = state.generated_tasks[1:]

        print(f"[WORKER] Dispatching -> {current_task.tool_name}")

        return {"generated_tasks": remaining, "current_task": current_task}

    # =========================================================================
    # 3. TOOL NODE
    # =========================================================================

    def tool_node(self, state: DataRetrievalAgentState):
        """
        TOOL NODE

        Executes the actual scraping tool specified by current_task.
        Handles errors gracefully and records results.

        Returns:
            ``{}`` if there is no active task; otherwise clears
            ``current_task`` and appends one RawScrapedData to
            ``worker_results`` with status "success" or "failed".
        """
        current_task = state.current_task
        if current_task is None:
            print("[TOOL NODE] No active task")
            return {}

        print(f"[TOOL NODE] Executing -> {current_task.tool_name}")

        tool_func = self.tools.get(current_task.tool_name)

        if tool_func is None:
            # Unknown tool name (e.g. hallucinated by the planner): record a
            # failed result instead of raising, so the graph keeps running.
            output = f"Tool '{current_task.tool_name}' not found in registry"
            status = "failed"
        else:
            try:
                # Invoke LangChain tool with parameters
                output = tool_func.invoke(current_task.parameters or {})
                status = "success"
                print("[TOOL NODE] ✓ Success")
            except Exception as e:
                output = f"Error: {str(e)}"
                status = "failed"
                print(f"[TOOL NODE] ✗ Failed: {e}")

        result = RawScrapedData(
            source_tool=current_task.tool_name, raw_content=str(output), status=status
        )

        return {"current_task": None, "worker_results": [result]}

    # =========================================================================
    # 4. CLASSIFIER AGENT
    # =========================================================================

    def classifier_agent_node(self, state: DataRetrievalAgentState):
        """
        DATA CLASSIFIER AGENT

        Analyzes scraped data and routes it to appropriate domain agents.
        Creates ClassifiedEvent objects with summaries and target agents.
        Falls back to rule-based routing (by source-tool name) when the LLM
        call or its JSON output fails.

        Returns:
            ``{}`` when there are no new results; otherwise a partial state
            update with ``classified_buffer`` and an emptied
            ``latest_worker_results``.
        """
        if not state.latest_worker_results:
            print("[CLASSIFIER] No new results to process")
            return {}

        print(f"[CLASSIFIER] Processing {len(state.latest_worker_results)} results")

        agent_categories = [
            "social",
            "economical",
            "political",
            "mobility",
            "weather",
            "intelligence",
        ]

        system_prompt = """
You are a data classification expert for Roger.

AVAILABLE AGENTS:
- social: Social media sentiment, public discussions
- economical: Stock market, economic indicators, CSE data
- political: Government gazette, parliament, regulations
- mobility: Transportation, train schedules, logistics
- weather: Meteorological data, disaster alerts
- intelligence: Brand monitoring, entity tracking

Task: Analyze the scraped data and:
1. Write a one-sentence summary
2. Choose the most appropriate agent

Respond with JSON:
{
  "summary": "<brief summary>",
  "target_agent": "<agent_name>"
}
"""

        all_classified: List[ClassifiedEvent] = []

        for result in state.latest_worker_results:
            try:
                response = self.llm.invoke(
                    [
                        SystemMessage(content=system_prompt),
                        HumanMessage(
                            # Cap payload at 2000 chars to bound prompt size.
                            content=f"Source: {result.source_tool}\n\nData:\n{result.raw_content[:2000]}"
                        ),
                    ]
                )

                # Tolerate ```json fences that models often add despite instructions.
                result_json = self._extract_json(response.content)
                summary = result_json.get("summary", "No summary")
                target = result_json.get("target_agent", "social")

                # Guard against hallucinated agent names.
                if target not in agent_categories:
                    target = "social"

            except Exception as e:
                print(f"[CLASSIFIER] LLM failed: {e}, using rule-based classification")

                # Fallback rule-based classification keyed on the tool name.
                source = result.source_tool.lower()
                if "stock" in source or "cse" in source:
                    target = "economical"
                elif "gazette" in source or "parliament" in source:
                    target = "political"
                elif "train" in source or "schedule" in source:
                    target = "mobility"
                elif any(s in source for s in ["reddit", "twitter", "facebook"]):
                    target = "social"
                else:
                    target = "social"

                summary = (
                    f"Data from {result.source_tool}: {result.raw_content[:150]}..."
                )

            classified = ClassifiedEvent(
                event_id=str(uuid.uuid4()),
                content_summary=summary,
                target_agent=target,
                confidence_score=0.85,
            )
            all_classified.append(classified)

        print(f"[CLASSIFIER] Classified {len(all_classified)} events")

        return {"classified_buffer": all_classified, "latest_worker_results": []}
        return {"classified_buffer": all_classified, "latest_worker_results": []}