Spaces:

aursalan
/

latch_jobs

Running

App Files Community

aursalan committed 16 days ago

Commit

61cfbd5

1 Parent(s): 94c36ed

Added changes: switch the job-embedding worker to the new denormalized jobs schema

Browse files

Files changed (1) hide show

main.py +55 -59

main.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import psycopg2
-from psycopg2.extras import execute_values
 import pandas as pd
 from sentence_transformers import SentenceTransformer
 import os
@@ -10,9 +10,9 @@ from fastapi import FastAPI, BackgroundTasks, HTTPException
 from contextlib import asynccontextmanager
 from fastapi.responses import HTMLResponse
 import threading
 # --- Configuration ---
-# You can set this via environment variable, or keep the hardcoded string here.
 SUPABASE_CONNECTION_STRING = os.getenv("SUPABASE_CONNECTION_STRING")
 # --- Toggles & Tuning ---
@@ -22,15 +22,15 @@ DRY_RUN = False
 # --- Global State ---
 model = None
-execution_logs = deque(maxlen=50) # Stores the last 50 batch logs in RAM
-processing_lock = threading.Lock() # Lock to prevent overlapping pings
-# --- Lifespan Manager (Loads Model on Startup) ---
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     global model
     print("⏳ Loading Model...")
-    # Load model once when the API starts
     model = SentenceTransformer('Alibaba-NLP/gte-modernbert-base', trust_remote_code=True)
     print("✅ Model Loaded.")
     yield
@@ -42,84 +42,89 @@ app = FastAPI(lifespan=lifespan)
 def fetch_and_lock_chunk(conn, chunk_size):
     """
-    Fetches the next batch of JOBS and LOCKS them
-    so other workers skip them (Concurrency Safe).
     """
     query = """
     WITH locked_jobs AS (
-        SELECT job_id, job_title, roles_and_responsibilities, qualification
         FROM jobs
         WHERE
-            -- Condition 1: Embedding is missing
-            job_embeddings IS NULL
             OR
-            -- Condition 2: Job details were updated more recently than the vector
-            updated_at > job_embeddings_updated_at
         LIMIT %s
-        FOR UPDATE SKIP LOCKED -- <--- Prevents conflicts
     )
-    SELECT
-        lj.job_id,
-        lj.job_title,
-        lj.roles_and_responsibilities,
-        lj.qualification,
-        -- 1. Skills (Subquery)
-        (SELECT json_agg(DISTINCT s.skill_name)
-         FROM job_skill_map jsm
-         JOIN skills s ON jsm.skill_id = s.skill_id
-         WHERE jsm.job_id = lj.job_id) AS skills
-    FROM locked_jobs lj;
     """
     return pd.read_sql_query(query, conn, params=(chunk_size,))
 def clean_and_format_text(row):
     """
-    Joins lists into a single string with Semantic Anchors (Tags).
     """
     # Configuration: Maps DB Column -> Semantic Tag
     field_config = [
-        ('job_title',                  'Job Title'),
         ('roles_and_responsibilities', 'Responsibilities'),
-        ('qualification',              'Qualifications'),
-        ('skills',                     'Required Skills')
     ]
     text_parts = []
     for col_name, tag in field_config:
-        if col_name in row:
             data = row[col_name]
-            # Case A: List of strings (Skills)
-            if isinstance(data, list) and len(data) > 0:
-                clean_items = [str(item).strip() for item in data if item is not None and str(item).strip()]
                 if clean_items:
                     text_parts.append(f"{tag}: " + ", ".join(clean_items))
-            # Case B: Single String (Title, Description)
-            elif isinstance(data, str) and data.strip():
-                # Clean up newlines in description to avoid messy formatting
-                clean_text = data.strip().replace('\r', '')
                 text_parts.append(f"{tag}: {clean_text}")
     return "\n".join(text_parts)
 def update_db_batch(conn, updates):
     if DRY_RUN: return
     query = """
         UPDATE jobs AS j
-        SET job_embeddings = data.vector::vector,
-            job_embeddings_updated_at = NOW()
         FROM (VALUES %s) AS data (id, vector)
-        WHERE j.job_id = data.id
     """
     cursor = conn.cursor()
     try:
         execute_values(cursor, query, updates)
-        conn.commit() # Releases locks
     except Exception as e:
         conn.rollback()
         raise e
@@ -130,14 +135,14 @@ def run_worker_logic():
     """
     The core logic that runs one single batch processing for JOBS.
     """
-    log_buffer = [] # Local buffer to capture logs for this specific run
     timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     log_buffer.append(f"<b>BATCH RUN: {timestamp}</b>")
     conn = None
     try:
-        conn = psycopg2.connect(SUPABASE_CONNECTION_STRING, sslmode='require')
         # 1. Fetch & Lock
         df = fetch_and_lock_chunk(conn, PROCESSING_CHUNK_SIZE)
@@ -145,7 +150,6 @@ def run_worker_logic():
         if df.empty:
             conn.rollback()
             log_buffer.append("💤 No pending jobs found.")
-            # Add to global logs and exit
             execution_logs.appendleft("<br>".join(log_buffer))
             return "No data"
@@ -154,14 +158,15 @@ def run_worker_logic():
         # 2. Clean Text
         df['full_text'] = df.apply(clean_and_format_text, axis=1)
-        # 3. Log Inputs (For the Root API view)
         for index, row in df.iterrows():
             log_buffer.append(f"<div style='border:1px solid #ccc; margin:5px; padding:5px; background:#f9f9f9'>")
-            log_buffer.append(f"<strong>ID: {row['job_id']} ({row.get('job_title', 'Unknown')})</strong>")
-            log_buffer.append(f"<pre style='white-space: pre-wrap;'>{row['full_text']}</pre>")
             log_buffer.append("</div>")
         # 4. Generate Embeddings
         embeddings = model.encode(
             df['full_text'].tolist(),
             batch_size=EMBEDDING_BATCH_SIZE,
@@ -171,7 +176,7 @@ def run_worker_logic():
         )
         # 5. Update DB
-        updates = list(zip(df['job_id'].tolist(), embeddings.tolist()))
         if not DRY_RUN:
             update_db_batch(conn, updates)
@@ -186,16 +191,12 @@ def run_worker_logic():
         print(f"Error: {e}")
     finally:
         if conn: conn.close()
-        # Push the local buffer to the global execution log
         execution_logs.appendleft("<br>".join(log_buffer))
 # --- API Endpoints ---
 @app.get("/", response_class=HTMLResponse)
 async def read_root():
-    """
-    Root endpoint: Displays the logs of recent processing batches.
-    """
     html_content = """
     <html>
         <head>
@@ -223,18 +224,13 @@ async def read_root():
 @app.get("/trigger-batch")
 async def trigger_processing(background_tasks: BackgroundTasks):
-    """
-    External Pinger: Hits this endpoint to trigger one batch of processing.
-    """
     if processing_lock.locked():
         return {"status": "busy", "message": "Worker is currently processing a previous batch."}
-    # We run the worker in a background task so the API response is fast
     background_tasks.add_task(wrapped_worker)
     return {"status": "started", "message": "Batch processing started in background."}
 def wrapped_worker():
-    """Thread-safe wrapper for the worker logic"""
     if processing_lock.acquire(blocking=False):
         try:
             run_worker_logic()

 import psycopg2
+from psycopg2.extras import execute_values, Json
 import pandas as pd
 from sentence_transformers import SentenceTransformer
 import os
 from contextlib import asynccontextmanager
 from fastapi.responses import HTMLResponse
 import threading
+import json
 # --- Configuration ---
 SUPABASE_CONNECTION_STRING = os.getenv("SUPABASE_CONNECTION_STRING")
 # --- Toggles & Tuning ---
 # --- Global State ---
 model = None
+execution_logs = deque(maxlen=50)
+processing_lock = threading.Lock()
+# --- Lifespan Manager ---
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     global model
     print("⏳ Loading Model...")
+    # Using the Alibaba GTE ModernBERT as requested
     model = SentenceTransformer('Alibaba-NLP/gte-modernbert-base', trust_remote_code=True)
     print("✅ Model Loaded.")
     yield
 def fetch_and_lock_chunk(conn, chunk_size):
     """
+    Fetches the next batch of JOBS from the new denormalized schema
+    and LOCKS them using FOR UPDATE SKIP LOCKED.
     """
     query = """
     WITH locked_jobs AS (
+        SELECT
+            id,
+            title,
+            company_name,
+            location,
+            work_model,
+            employment_type,
+            roles_and_responsibilities,
+            qualification,
+            min_experience
         FROM jobs
         WHERE
+            -- Condition 1: Embedding is missing (New Job)
+            embeddings IS NULL
             OR
+            -- Condition 2: Job created after the last embedding (Retry/Update Logic)
+            -- Note: Since there is no 'updated_at' column, we rely on created_at vs embeddings_created_at
+            (embeddings_created_at IS NOT NULL AND created_at > embeddings_created_at)
         LIMIT %s
+        FOR UPDATE SKIP LOCKED
     )
+    SELECT * FROM locked_jobs;
     """
+    # psycopg2 decodes JSONB columns into Python objects (lists/dicts); pandas passes them through unchanged
     return pd.read_sql_query(query, conn, params=(chunk_size,))
 def clean_and_format_text(row):
     """
+    Joins denormalized columns into a single semantic string for embedding.
     """
     # Configuration: Maps DB Column -> Semantic Tag
+    # (Column Name in DF, Label for Text)
     field_config = [
+        ('title',                  'Job Title'),
+        ('company_name',           'Company'),
+        ('location',               'Location'),
+        ('work_model',             'Work Model'),
+        ('min_experience',         'Minimum Experience (Years)'),
         ('roles_and_responsibilities', 'Responsibilities'),
+        ('qualification',          'Qualifications')
     ]
     text_parts = []
     for col_name, tag in field_config:
+        if col_name in row and row[col_name] is not None:
             data = row[col_name]
+            # Case A: JSONB List (Roles, Qualifications)
+            if isinstance(data, list):
+                # Filter out empty strings or None values
+                clean_items = [str(item).strip() for item in data if item and str(item).strip()]
                 if clean_items:
                     text_parts.append(f"{tag}: " + ", ".join(clean_items))
+            # Case B: Standard String/Int (Title, Company, Experience)
+            elif str(data).strip():
+                clean_text = str(data).strip().replace('\r', '')
                 text_parts.append(f"{tag}: {clean_text}")
+    # Combine all parts with newlines
     return "\n".join(text_parts)
 def update_db_batch(conn, updates):
     if DRY_RUN: return
+    # Update the 'embeddings' column and the 'embeddings_created_at' timestamp
     query = """
         UPDATE jobs AS j
+        SET embeddings = data.vector::vector,
+            embeddings_created_at = NOW()
         FROM (VALUES %s) AS data (id, vector)
+        WHERE j.id = data.id::uuid
     """
     cursor = conn.cursor()
     try:
         execute_values(cursor, query, updates)
+        conn.commit()
     except Exception as e:
         conn.rollback()
         raise e
     """
     The core logic that runs one single batch processing for JOBS.
     """
+    log_buffer = []
     timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     log_buffer.append(f"<b>BATCH RUN: {timestamp}</b>")
     conn = None
     try:
+        conn = psycopg2.connect(SUPABASE_CONNECTION_STRING)
         # 1. Fetch & Lock
         df = fetch_and_lock_chunk(conn, PROCESSING_CHUNK_SIZE)
         if df.empty:
             conn.rollback()
             log_buffer.append("💤 No pending jobs found.")
             execution_logs.appendleft("<br>".join(log_buffer))
             return "No data"
         # 2. Clean Text
         df['full_text'] = df.apply(clean_and_format_text, axis=1)
+        # 3. Log Inputs (for debugging/visibility)
         for index, row in df.iterrows():
             log_buffer.append(f"<div style='border:1px solid #ccc; margin:5px; padding:5px; background:#f9f9f9'>")
+            log_buffer.append(f"<strong>ID: {row['id']} - {row.get('title', 'Unknown')}</strong>")
+            log_buffer.append(f"<pre style='white-space: pre-wrap; font-size: 0.8em;'>{row['full_text']}</pre>")
             log_buffer.append("</div>")
         # 4. Generate Embeddings
+        # Note: Ensure the model dimensions match your DB vector size (ModernBERT is typically 768)
         embeddings = model.encode(
             df['full_text'].tolist(),
             batch_size=EMBEDDING_BATCH_SIZE,
         )
         # 5. Update DB
+        updates = list(zip(df['id'].tolist(), embeddings.tolist()))
         if not DRY_RUN:
             update_db_batch(conn, updates)
         print(f"Error: {e}")
     finally:
         if conn: conn.close()
         execution_logs.appendleft("<br>".join(log_buffer))
 # --- API Endpoints ---
 @app.get("/", response_class=HTMLResponse)
 async def read_root():
     html_content = """
     <html>
         <head>
 @app.get("/trigger-batch")
 async def trigger_processing(background_tasks: BackgroundTasks):
     if processing_lock.locked():
         return {"status": "busy", "message": "Worker is currently processing a previous batch."}
     background_tasks.add_task(wrapped_worker)
     return {"status": "started", "message": "Batch processing started in background."}
 def wrapped_worker():
     if processing_lock.acquire(blocking=False):
         try:
             run_worker_logic()