Final_Assignment_Template

Sleeping

App Files Files Community

mhamzaanjum380 committed on Jun 30, 2025

Commit

aee413d

verified ·

1 Parent(s): d6187cd

Upload 2 files

Browse files

Files changed (2) hide show

explore_metadata.ipynb +601 -0
metadata.jsonl +0 -0

explore_metadata.ipynb ADDED Viewed

	@@ -0,0 +1,601 @@

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "source": [
+        "# Load metadata.jsonl\n",
+        "import json\n",
+        "# Load the metadata.jsonl file\n",
+        "with open('metadata.jsonl', 'r') as jsonl_file:\n",
+        "    json_list = list(jsonl_file)"
+      ],
+      "metadata": {
+        "id": "jErfXbqHx1T3"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "type(json_list)"
+      ],
+      "metadata": {
+        "id": "RCcbpQD3x1Pp"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "json_QA = []\n",
+        "for json_str in json_list:\n",
+        "    json_data = json.loads(json_str)\n",
+        "    json_QA.append(json_data)"
+      ],
+      "metadata": {
+        "id": "F-6MzF9Zx1LR"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "json_QA[0]"
+      ],
+      "metadata": {
+        "id": "guJYoExXx1Fv"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import random\n",
+        "\n",
+        "random_samples = random.sample(json_QA, 1)\n",
+        "for sample in random_samples:\n",
+        "    print(\"=\" * 75)\n",
+        "    print(f\"Task ID: {sample['task_id']}\")\n",
+        "    print(f\"Question: {sample['Question']}\")\n",
+        "    print(f\"Level: {sample['Level']}\")\n",
+        "    print(f\"Final Answer: {sample['Final answer']}\")\n",
+        "    print(f\"Annotator Metadata: \")\n",
+        "    print(f\"  ├── Steps: \")\n",
+        "    for step in sample['Annotator Metadata']['Steps'].split('\\n'):\n",
+        "        print(f\"  │      ├── {step}\")\n",
+        "    print(f\"  ├── Number of steps: {sample['Annotator Metadata']['Number of steps']}\")\n",
+        "    print(f\"  ├── How long did this take?: {sample['Annotator Metadata']['How long did this take?']}\")\n",
+        "    print(f\"  ├── Tools:\")\n",
+        "    for tool in sample['Annotator Metadata']['Tools'].split('\\n'):\n",
+        "        print(f\"  │      ├── {tool}\")\n",
+        "    print(f\"  └── Number of tools: {sample['Annotator Metadata']['Number of tools']}\")\n",
+        "print(\"=\" * 75)"
+      ],
+      "metadata": {
+        "id": "9lHV1amUx1A4"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import os\n",
+        "from dotenv import load_dotenv\n",
+        "from langchain_huggingface import HuggingFaceEmbeddings\n",
+        "from langchain_community.vectorstores import FAISS"
+      ],
+      "metadata": {
+        "id": "A5EaWko_x086"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")"
+      ],
+      "metadata": {
+        "id": "pNY9Q1egx04l"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from langchain.schema import Document\n",
+        "\n",
+        "docs = []\n",
+        "for sample in json_QA:\n",
+        "    content = f\"Question : {sample['Question']}\\n\\nFinal answer : {sample['Final answer']}\"\n",
+        "    doc = Document(\n",
+        "        page_content=content,\n",
+        "        metadata={\n",
+        "            \"source\": sample['task_id']\n",
+        "        }\n",
+        "    )\n",
+        "    docs.append(doc)"
+      ],
+      "metadata": {
+        "id": "iZLSkNl_x00a"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "docs[2]"
+      ],
+      "metadata": {
+        "id": "gOWff8jhB9RT"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "db = FAISS.from_documents(documents=docs, embedding=embeddings)"
+      ],
+      "metadata": {
+        "id": "7GUXBcBQx0qk"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "db.save_local(\"qa_index\")"
+      ],
+      "metadata": {
+        "id": "o8wwBjRw5mXL"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "folder_path = \"qa_index\"\n",
+        "# Reload the saved FAISS index from disk into new_db\n",
+        "new_db = FAISS.load_local(folder_path, embeddings=embeddings, allow_dangerous_deserialization=True)"
+      ],
+      "metadata": {
+        "id": "IzUkiQIjx0nF"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "retriever = new_db.as_retriever()"
+      ],
+      "metadata": {
+        "id": "PqHNQ5DX96OF"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "query = \"On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?\"\n",
+        "docs = retriever.invoke(query)\n",
+        "docs[0]"
+      ],
+      "metadata": {
+        "id": "hTpWn1hXx0j3"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# list of the tools used in all the samples\n",
+        "from collections import Counter, OrderedDict\n",
+        "\n",
+        "tools = []\n",
+        "for sample in json_QA:\n",
+        "    for tool in sample['Annotator Metadata']['Tools'].split('\\n'):\n",
+        "        tool = tool[2:].strip().lower()\n",
+        "        if tool.startswith(\"(\"):\n",
+        "            tool = tool[11:].strip()\n",
+        "        tools.append(tool)\n",
+        "tools_counter = OrderedDict(Counter(tools))\n",
+        "print(\"List of tools used in all samples:\")\n",
+        "print(\"Total number of tools used:\", len(tools_counter))\n",
+        "for tool, count in tools_counter.items():\n",
+        "    print(f\"  ├── {tool}: {count}\")"
+      ],
+      "metadata": {
+        "id": "PXjVbAEQx0gU"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "system_prompt = \"\"\"\n",
+        "You are a helpful assistant tasked with answering questions using a set of tools.\n",
+        "If the tool is not available, you can try to find the information online. You can also use your own knowledge to answer the question.\n",
+        "You need to provide a step-by-step explanation of how you arrived at the answer.\n",
+        "==========================\n",
+        "Here are a few examples showing you how to answer the question step by step.\n",
+        "\"\"\"\n",
+        "\n",
+        "for i, samples in enumerate(random_samples):\n",
+        "    system_prompt += f\"\\nQuestion {i+1}: {samples['Question']}\\nSteps:\\n{samples['Annotator Metadata']['Steps']}\\nTools:\\n{samples['Annotator Metadata']['Tools']}\\nFinal Answer: {samples['Final answer']}\\n\"\n",
+        "system_prompt += \"\\n==========================\\n\"\n",
+        "system_prompt += \"Now, please answer the following question step by step.\\n\""
+      ],
+      "metadata": {
+        "id": "s_Nny7csx0cb"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# save the system_prompt to a file\n",
+        "with open('system_prompt.txt', 'w') as f:\n",
+        "    f.write(system_prompt)"
+      ],
+      "metadata": {
+        "id": "mgVVvO8zx0Yj"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# load the system prompt from the file\n",
+        "with open('system_prompt.txt', 'r') as f:\n",
+        "    system_prompt = f.read()\n",
+        "print(system_prompt)"
+      ],
+      "metadata": {
+        "id": "tGRnor1Ox0UZ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Start building Agent"
+      ],
+      "metadata": {
+        "id": "_Dv0qdFZ_c8i"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from langgraph.graph import MessagesState, START, StateGraph\n",
+        "from langgraph.prebuilt import tools_condition\n",
+        "from langgraph.prebuilt import ToolNode\n",
+        "from langchain_google_genai import ChatGoogleGenerativeAI\n",
+        "from langchain_huggingface import HuggingFaceEmbeddings\n",
+        "from langchain_community.tools.tavily_search import TavilySearchResults\n",
+        "from langchain_community.document_loaders import WikipediaLoader\n",
+        "from langchain_community.document_loaders import ArxivLoader\n",
+        "from langchain_community.vectorstores import FAISS\n",
+        "from langchain.tools.retriever import create_retriever_tool\n",
+        "from langchain_core.messages import HumanMessage, SystemMessage\n",
+        "from langchain_core.tools import tool"
+      ],
+      "metadata": {
+        "id": "25fNKGasx0Qk"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")"
+      ],
+      "metadata": {
+        "id": "sEmFrORkx0Mp"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "vector_store = new_db.as_retriever()"
+      ],
+      "metadata": {
+        "id": "DelgLC92x0JS"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "TkxZGipCxvsH"
+      },
+      "outputs": [],
+      "source": [
+        "question_retrieve_tool = create_retriever_tool(\n",
+        "    vector_store,\n",
+        "    \"Question Retriever\",\n",
+        "    \"Find similar questions in the vector database for the given question.\",\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "@tool\n",
+        "def multiply(a: int, b: int) -> int:\n",
+        "    \"\"\"Multiply two numbers.\n",
+        "\n",
+        "    Args:\n",
+        "        a: first int\n",
+        "        b: second int\n",
+        "    \"\"\"\n",
+        "    return a * b\n",
+        "\n",
+        "@tool\n",
+        "def add(a: int, b: int) -> int:\n",
+        "    \"\"\"Add two numbers.\n",
+        "\n",
+        "    Args:\n",
+        "        a: first int\n",
+        "        b: second int\n",
+        "    \"\"\"\n",
+        "    return a + b\n",
+        "\n",
+        "@tool\n",
+        "def subtract(a: int, b: int) -> int:\n",
+        "    \"\"\"Subtract two numbers.\n",
+        "\n",
+        "    Args:\n",
+        "        a: first int\n",
+        "        b: second int\n",
+        "    \"\"\"\n",
+        "    return a - b\n",
+        "\n",
+        "@tool\n",
+        "def divide(a: int, b: int) -> int:\n",
+        "    \"\"\"Divide two numbers.\n",
+        "\n",
+        "    Args:\n",
+        "        a: first int\n",
+        "        b: second int\n",
+        "    \"\"\"\n",
+        "    if b == 0:\n",
+        "        raise ValueError(\"Cannot divide by zero.\")\n",
+        "    return a / b\n",
+        "\n",
+        "@tool\n",
+        "def modulus(a: int, b: int) -> int:\n",
+        "    \"\"\"Get the modulus of two numbers.\n",
+        "\n",
+        "    Args:\n",
+        "        a: first int\n",
+        "        b: second int\n",
+        "    \"\"\"\n",
+        "    return a % b\n",
+        "\n",
+        "@tool\n",
+        "def wiki_search(query: str) -> str:\n",
+        "    \"\"\"Search Wikipedia for a query and return a maximum of 2 results.\n",
+        "\n",
+        "    Args:\n",
+        "        query: The search query.\"\"\"\n",
+        "    search_docs = WikipediaLoader(query=query, load_max_docs=2).load()\n",
+        "    formatted_search_docs = \"\\n\\n---\\n\\n\".join(\n",
+        "        [\n",
+        "            f'<Document source=\"{doc.metadata[\"source\"]}\" page=\"{doc.metadata.get(\"page\", \"\")}\"/>\\n{doc.page_content}\\n</Document>'\n",
+        "            for doc in search_docs\n",
+        "        ])\n",
+        "    return {\"wiki_results\": formatted_search_docs}\n",
+        "\n",
+        "@tool\n",
+        "def web_search(query: str) -> str:\n",
+        "    \"\"\"Search Tavily for a query and return a maximum of 3 results.\n",
+        "\n",
+        "    Args:\n",
+        "        query: The search query.\"\"\"\n",
+        "    search_docs = TavilySearchResults(max_results=3).invoke(query=query)\n",
+        "    formatted_search_docs = \"\\n\\n---\\n\\n\".join(\n",
+        "        [\n",
+        "            f'<Document source=\"{doc.metadata[\"source\"]}\" page=\"{doc.metadata.get(\"page\", \"\")}\"/>\\n{doc.page_content}\\n</Document>'\n",
+        "            for doc in search_docs\n",
+        "        ])\n",
+        "    return {\"web_results\": formatted_search_docs}\n",
+        "\n",
+        "@tool\n",
+        "def arvix_search(query: str) -> str:\n",
+        "    \"\"\"Search arXiv for a query and return a maximum of 3 results.\n",
+        "\n",
+        "    Args:\n",
+        "        query: The search query.\"\"\"\n",
+        "    search_docs = ArxivLoader(query=query, load_max_docs=3).load()\n",
+        "    formatted_search_docs = \"\\n\\n---\\n\\n\".join(\n",
+        "        [\n",
+        "            f'<Document source=\"{doc.metadata[\"source\"]}\" page=\"{doc.metadata.get(\"page\", \"\")}\"/>\\n{doc.page_content[:1000]}\\n</Document>'\n",
+        "            for doc in search_docs\n",
+        "        ])\n",
+        "    return {\"arvix_results\": formatted_search_docs}\n",
+        "\n",
+        "@tool\n",
+        "def similar_question_search(question: str) -> str:\n",
+        "    \"\"\"Search the vector database for similar questions and return the top 3 results.\n",
+        "\n",
+        "    Args:\n",
+        "        question: the question provided by the human.\"\"\"\n",
+        "    matched_docs = vector_store.similarity_search(query, 3)\n",
+        "    formatted_search_docs = \"\\n\\n---\\n\\n\".join(\n",
+        "        [\n",
+        "            f'<Document source=\"{doc.metadata[\"source\"]}\" page=\"{doc.metadata.get(\"page\", \"\")}\"/>\\n{doc.page_content[:1000]}\\n</Document>'\n",
+        "            for doc in matched_docs\n",
+        "        ])\n",
+        "    return {\"similar_questions\": formatted_search_docs}\n"
+      ],
+      "metadata": {
+        "id": "fjaTIMVwFQJX"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "tools = [\n",
+        "    multiply,\n",
+        "    add,\n",
+        "    subtract,\n",
+        "    divide,\n",
+        "    modulus,\n",
+        "    wiki_search,\n",
+        "    web_search,\n",
+        "    arvix_search,\n",
+        "    similar_question_search,\n",
+        "    question_retrieve_tool\n",
+        "]"
+      ],
+      "metadata": {
+        "id": "9NVPKEV0GAFi"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "K9zA9G1uqBGj"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "llm = ChatGoogleGenerativeAI(model=\"gemini-2.0-flash\")\n",
+        "llm_with_tools = llm.bind_tools(tools)"
+      ],
+      "metadata": {
+        "id": "qas0W-ImGBte"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# load the system prompt from the file\n",
+        "with open('system_prompt.txt', 'r') as f:\n",
+        "    system_prompt = f.read()\n",
+        "\n",
+        "\n",
+        "# System message\n",
+        "sys_msg = SystemMessage(content=system_prompt)"
+      ],
+      "metadata": {
+        "id": "wVmI8Rf5GBpb"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Node\n",
+        "def assistant(state: MessagesState):\n",
+        "    \"\"\"Assistant node\"\"\"\n",
+        "    return {\"messages\": [llm_with_tools.invoke([sys_msg] + state[\"messages\"])]}\n",
+        "\n",
+        "# Build graph\n",
+        "builder = StateGraph(MessagesState)\n",
+        "builder.add_node(\"assistant\", assistant)\n",
+        "builder.add_node(\"tools\", ToolNode(tools))\n",
+        "builder.add_edge(START, \"assistant\")\n",
+        "builder.add_conditional_edges(\n",
+        "    \"assistant\",\n",
+        "    tools_condition,\n",
+        ")\n",
+        "builder.add_edge(\"tools\", \"assistant\")\n",
+        "\n",
+        "# Compile graph\n",
+        "graph = builder.compile()"
+      ],
+      "metadata": {
+        "id": "gBXKT6YtGBkU"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from IPython.display import Image, display\n",
+        "\n",
+        "display(Image(graph.get_graph(xray=True).draw_mermaid_png()))"
+      ],
+      "metadata": {
+        "id": "Clsd8J7fGBfl"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "question = \"\"\n",
+        "messages = [HumanMessage(content=question)]\n",
+        "messages = graph.invoke({\"messages\": messages})"
+      ],
+      "metadata": {
+        "id": "G0tvlcKnGBbr"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "for m in messages['messages']:\n",
+        "    m.pretty_print()"
+      ],
+      "metadata": {
+        "id": "uIpDcVbjG-hN"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}

metadata.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff