import os
import json
import tempfile
from datetime import datetime
from flask import Flask, render_template, request, jsonify, session, redirect, url_for
import google.generativeai as genai
from sentence_transformers import SentenceTransformer
# Removed ChromaDB and added Qdrant
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, Filter, FieldCondition, MatchValue, PointStruct, SearchParams, FilterSelector
# LangChain splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
import arxiv
import PyPDF2
from docx import Document
import requests
from werkzeug.utils import secure_filename
from dotenv import load_dotenv
import uuid
import re
from bs4 import BeautifulSoup
import logging
import numpy as np

# Load environment variables
load_dotenv()
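# For reference, the environment variables read in this module (via os.getenv /
# os.environ below) are: SECRET_KEY, GEMINI_API_KEY, QDRANT_URL, QDRANT_API_KEY,
# PORT and FLASK_ENV. Embedding-model settings (LOCAL_MODEL_PATH, EMBEDDING_MODEL)
# come from config.Config rather than the environment.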
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__)
app.secret_key = os.getenv('SECRET_KEY', 'research-radar-secret-key-2024')

# Configuration
UPLOAD_FOLDER = 'uploads'
ALLOWED_EXTENSIONS = {'txt', 'pdf', 'docx'}
MAX_CONTENT_LENGTH = 16 * 1024 * 1024  # 16MB max file size
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['MAX_CONTENT_LENGTH'] = MAX_CONTENT_LENGTH

# Ensure directories exist
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Initialize models and services
try:
    # Configure Gemini API
    gemini_api_key = os.getenv('GEMINI_API_KEY')
    if gemini_api_key:
        genai.configure(api_key=gemini_api_key)
        gemini_model = genai.GenerativeModel('gemini-2.5-flash')
        logger.info("Gemini API initialized successfully")
    else:
        gemini_model = None
        logger.warning("Gemini API key not found. AI features will be limited.")

    # Initialize sentence transformer for embeddings (local model)
    from config import Config
    local_model_path = Config.LOCAL_MODEL_PATH
    if os.path.exists(local_model_path):
        embedding_model = SentenceTransformer(local_model_path)
        logger.info(f"Local sentence transformer model loaded from: {local_model_path}")
    else:
        # Fall back to downloading if the local model is not found
        embedding_model = SentenceTransformer(Config.EMBEDDING_MODEL)
        logger.warning(f"Local model not found at {local_model_path}, downloading {Config.EMBEDDING_MODEL} from HuggingFace")

    # Determine vector size dynamically
    try:
        _probe_vec = embedding_model.encode(["probe text"])
        VECTOR_SIZE = int(_probe_vec.shape[-1]) if hasattr(_probe_vec, 'shape') else len(_probe_vec[0])
    except Exception:
        VECTOR_SIZE = 384  # fallback for all-MiniLM-L6-v2

    # Initialize Qdrant client
    qdrant_url = os.getenv('QDRANT_URL')
    qdrant_api_key = os.getenv('QDRANT_API_KEY')
    qdrant_client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key, timeout=120)
    logger.info("Qdrant client initialized")

    # Ensure default collection exists
    def ensure_qdrant_collection(collection_name: str, vector_size: int) -> None:
        try:
            qdrant_client.get_collection(collection_name)
        except Exception:
            qdrant_client.recreate_collection(
                collection_name=collection_name,
                vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
            )
            logger.info(f"Created Qdrant collection: {collection_name}")
        # Ensure payload index for document_id exists
        try:
            qdrant_client.create_payload_index(
                collection_name=collection_name,
                field_name="document_id",
                field_schema="keyword"
            )
            logger.info("Ensured payload index for 'document_id'")
        except Exception:
            # Likely already exists
            pass

    ensure_qdrant_collection('research_papers', VECTOR_SIZE)
except Exception as e:
    logger.error(f"Initialization error: {e}")
    embedding_model = None
    gemini_model = None
    qdrant_client = None
    VECTOR_SIZE = None

def allowed_file(filename):
    """Check if file extension is allowed"""
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

def extract_text_from_pdf(file_path):
    """Extract text from PDF file"""
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            text = ""
            for page in pdf_reader.pages:
                text += page.extract_text() + "\n"
            return text
    except Exception as e:
        print(f"PDF extraction error: {e}")
        return ""

def extract_text_from_docx(file_path):
    """Extract text from DOCX file"""
    try:
        doc = Document(file_path)
        text = ""
        for paragraph in doc.paragraphs:
            text += paragraph.text + "\n"
        return text
    except Exception as e:
        print(f"DOCX extraction error: {e}")
        return ""

def extract_text_from_txt(file_path):
    """Extract text from TXT file"""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except Exception as e:
        print(f"TXT extraction error: {e}")
        return ""

def process_document(file_path, filename):
    """Process uploaded document and extract text"""
    file_extension = filename.rsplit('.', 1)[1].lower()
    if file_extension == 'pdf':
        return extract_text_from_pdf(file_path)
    elif file_extension == 'docx':
        return extract_text_from_docx(file_path)
    elif file_extension == 'txt':
        return extract_text_from_txt(file_path)
    else:
        return ""

def search_arxiv_papers(query, max_results=10):
    """Search arXiv papers"""
    try:
        client = arxiv.Client()
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.Relevance
        )
        papers = []
        for result in client.results(search):
            paper = {
                'title': result.title,
                'authors': [author.name for author in result.authors],
                'summary': result.summary,
                'url': result.entry_id,
                'pdf_url': result.pdf_url,
                'published': result.published.strftime('%Y-%m-%d'),
                'category': result.primary_category
            }
            papers.append(paper)
        return papers
    except Exception as e:
        print(f"arXiv search error: {e}")
        return []

def generate_summary(text, max_length=500):
    """Generate summary using Gemini API"""
    try:
        if not gemini_model:
            return "Summary generation unavailable - API not configured"
        prompt = f"""
        Please provide a comprehensive summary of this research paper/document in approximately {max_length} words.
        Focus on:
        1. Main research question/objective
        2. Key methodology
        3. Important findings
        4. Conclusions and implications

        Text to summarize:
        {text[:80000]}
        """
        response = gemini_model.generate_content(prompt)
        return response.text
    except Exception as e:
        logger.error(f"Summary generation error: {e}")
        return "Error generating summary. Please try again."

# Text chunking using LangChain
def chunk_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""]
    )
    return splitter.split_text(text)
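# Illustrative note on chunk_text (informal, not from the original source): with
# the separators above the splitter prefers paragraph breaks ("\n\n"), then line
# breaks, then spaces, and packs the resulting pieces into chunks of at most
# ~chunk_size characters with roughly chunk_overlap characters of overlap.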
# Qdrant helpers
def ensure_qdrant_collection(collection_name: str, vector_size: int) -> None:
    """Create Qdrant collection if it doesn't exist"""
    if not qdrant_client:
        return
    try:
        qdrant_client.get_collection(collection_name)
    except Exception:
        qdrant_client.recreate_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
        )
    # Ensure payload index for document_id exists for efficient filtering/scrolling
    try:
        qdrant_client.create_payload_index(
            collection_name=collection_name,
            field_name="document_id",
            field_schema="keyword"
        )
    except Exception:
        pass

def add_document_to_vector_db(text, metadata, doc_id, collection_name="research_papers"):
    """Add chunked document vectors to Qdrant for chat functionality"""
    try:
        if not embedding_model or not qdrant_client or not VECTOR_SIZE:
            return False
        ensure_qdrant_collection(collection_name, VECTOR_SIZE)
        # Split text using recursive text splitter
        chunks = chunk_text(text, chunk_size=1200, chunk_overlap=250)
        if not chunks:
            return False
        embeddings = embedding_model.encode(chunks)
        vectors = embeddings.tolist() if hasattr(embeddings, 'tolist') else embeddings
        points = []
        for i, (chunk, vector) in enumerate(zip(chunks, vectors)):
            payload = dict(metadata or {})
            payload.update({
                'document_id': doc_id,
                'chunk_index': i,
                'total_chunks': len(chunks),
                'content': chunk,
            })
            points.append(
                PointStruct(
                    id=str(uuid.uuid4()),
                    vector=vector,
                    payload=payload
                )
            )
        qdrant_client.upsert(collection_name=collection_name, points=points, wait=True)
        return True
    except Exception as e:
        print(f"Vector DB error: {e}")
        return False
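# For reference, every point stored above carries the payload keys document_id,
# chunk_index, total_chunks and content, plus whatever metadata the caller
# supplied (e.g. filename/upload_date/type for uploads, or title/authors/
# pdf_url/published for arXiv papers); the retrieval helpers below read these keys.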
def query_vector_db(query, doc_id, collection_name="research_papers", n_results=3):
    """Query Qdrant for similar chunks for the given document_id"""
    try:
        if not embedding_model or not qdrant_client or not VECTOR_SIZE:
            return []
        ensure_qdrant_collection(collection_name, VECTOR_SIZE)
        query_embedding = embedding_model.encode([query])
        query_vector = query_embedding[0].tolist() if hasattr(query_embedding, 'tolist') else list(query_embedding[0])
        flt = Filter(must=[FieldCondition(key="document_id", match=MatchValue(value=doc_id))])
        results = qdrant_client.search(
            collection_name=collection_name,
            query_vector=query_vector,
            limit=n_results,
            query_filter=flt,
            with_payload=True,
            with_vectors=False
        )
        documents = []
        for r in results or []:
            payload = getattr(r, 'payload', None) or {}
            documents.append(payload.get('content', ''))
        return {'documents': [documents]}
    except Exception as e:
        print(f"Vector DB query error: {e}")
        return []
def get_all_chunks_for_document(doc_id: str, collection_name: str = "research_papers"):
    """Retrieve all chunks for a document from Qdrant, ordered by chunk_index"""
    try:
        all_points = []
        next_offset = None
        flt = Filter(must=[FieldCondition(key="document_id", match=MatchValue(value=doc_id))])
        while True:
            points, next_offset = qdrant_client.scroll(
                collection_name=collection_name,
                scroll_filter=flt,
                limit=500,
                offset=next_offset,
                with_payload=True,
                with_vectors=False
            )
            all_points.extend(points)
            if not next_offset:
                break
        # Order by chunk_index
        all_points.sort(key=lambda p: p.payload.get('chunk_index', 0))
        return [p.payload.get('content', '') for p in all_points]
    except Exception as e:
        print(f"Qdrant scroll error: {e}")
        return []
def get_all_documents(collection_name: str = "research_papers"):
    """Get all unique documents from Qdrant with their metadata"""
    try:
        if not qdrant_client:
            return []
        # Get all points to extract unique documents
        all_points = []
        next_offset = None
        while True:
            points, next_offset = qdrant_client.scroll(
                collection_name=collection_name,
                limit=1000,
                offset=next_offset,
                with_payload=True,
                with_vectors=False
            )
            all_points.extend(points)
            if not next_offset:
                break
        # Group by document_id and extract metadata
        documents = {}
        for point in all_points:
            payload = point.payload or {}
            doc_id = payload.get('document_id')
            if not doc_id:
                continue
            if doc_id not in documents:
                # Create document metadata from the first chunk seen
                doc_type = payload.get('type', 'document')
                # Generate a proper title based on type
                title = payload.get('title', 'Untitled Document')
                if doc_type == 'arxiv_paper' and payload.get('pdf_url'):
                    # Extract arXiv ID from URL for a better title
                    pdf_url = payload.get('pdf_url', '')
                    if 'arxiv.org/pdf/' in pdf_url:
                        arxiv_id = pdf_url.split('/')[-1].replace('.pdf', '')
                        title = f"arXiv:{arxiv_id}"
                    elif 'arxiv.org/abs/' in pdf_url:
                        arxiv_id = pdf_url.split('/')[-1]
                        title = f"arXiv:{arxiv_id}"
                elif doc_type == 'uploaded_document' and payload.get('filename'):
                    title = payload.get('filename')
                documents[doc_id] = {
                    'document_id': doc_id,
                    'title': title,
                    'authors': payload.get('authors', ['Unknown']),
                    'published': payload.get('published', 'Unknown Date'),
                    'category': payload.get('category', 'Research'),
                    'filename': payload.get('filename', ''),
                    'pdf_url': payload.get('pdf_url', ''),
                    'type': doc_type,
                    'upload_date': payload.get('upload_date', ''),
                    'total_chunks': payload.get('total_chunks', 0),
                    'word_count': payload.get('word_count', 0)
                }
        # Convert to list and sort by upload date (newest first)
        doc_list = list(documents.values())
        doc_list.sort(key=lambda x: x.get('upload_date', ''), reverse=True)
        return doc_list
    except Exception as e:
        print(f"Error getting documents: {e}")
        return []
def get_document_metadata(doc_id: str, collection_name: str = "research_papers"):
    """Get metadata for a specific document"""
    try:
        if not qdrant_client:
            return None
        # Get first chunk to extract metadata
        flt = Filter(must=[FieldCondition(key="document_id", match=MatchValue(value=doc_id))])
        results = qdrant_client.scroll(
            collection_name=collection_name,
            scroll_filter=flt,
            limit=1,
            with_payload=True,
            with_vectors=False
        )
        if results and results[0]:
            payload = results[0][0].payload or {}
            return {
                'document_id': doc_id,
                'title': payload.get('title', 'Untitled Document'),
                'authors': payload.get('authors', ['Unknown']),
                'published': payload.get('published', 'Unknown Date'),
                'category': payload.get('category', 'Research'),
                'filename': payload.get('filename', ''),
                'pdf_url': payload.get('pdf_url', ''),
                'type': payload.get('type', 'document'),
                'upload_date': payload.get('upload_date', ''),
                'total_chunks': payload.get('total_chunks', 0),
                'word_count': payload.get('word_count', 0)
            }
        return None
    except Exception as e:
        print(f"Error getting document metadata: {e}")
        return None
# Paper ingestion helpers
def resolve_pdf_url(url_or_pdf: str) -> str:
    """Normalize an arXiv abs/pdf URL to a direct PDF URL; pass other URLs through."""
    if not url_or_pdf:
        return ''
    if 'arxiv.org/pdf/' in url_or_pdf and url_or_pdf.endswith('.pdf'):
        return url_or_pdf
    # Convert an arXiv abs URL to a PDF URL
    m = re.search(r"arxiv\.org/(abs|pdf)/([\w\.-]+)", url_or_pdf)
    if m:
        arxiv_id = m.group(2)
        if not arxiv_id.endswith('.pdf'):
            return f"https://arxiv.org/pdf/{arxiv_id}.pdf"
        return f"https://arxiv.org/pdf/{arxiv_id}"
    return url_or_pdf
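# Illustrative examples of resolve_pdf_url (hypothetical inputs, shown for clarity):
#   "https://arxiv.org/abs/1234.56789"     -> "https://arxiv.org/pdf/1234.56789.pdf"
#   "https://arxiv.org/pdf/1234.56789.pdf" -> returned unchanged
#   any non-arXiv URL                      -> returned unchanged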
def download_pdf_to_temp(pdf_url: str) -> str:
    """Download a PDF to a temporary file and return the file path."""
    r = requests.get(pdf_url, stream=True, timeout=30)
    r.raise_for_status()
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
        for chunk in r.iter_content(chunk_size=8192):
            if chunk:
                tmp.write(chunk)
    return tmp.name
def ingest_paper(pdf_url: str, paper_meta: dict = None) -> tuple:
    """Download PDF, extract text, chunk, embed and store in Qdrant. Returns (doc_id, word_count)."""
    pdf_url = resolve_pdf_url(pdf_url)
    doc_id = str(uuid.uuid4())
    tmp_path = None
    try:
        tmp_path = download_pdf_to_temp(pdf_url)
        text_content = extract_text_from_pdf(tmp_path)
        if not text_content.strip():
            return None, 0
        metadata = {
            'source': 'arxiv',
            'pdf_url': pdf_url,
            'type': 'arxiv_paper'
        }
        if paper_meta:
            metadata.update(paper_meta)
        ok = add_document_to_vector_db(text_content, metadata, doc_id)
        if not ok:
            return None, 0
        # Set active document
        session['active_document_id'] = doc_id
        return doc_id, len(text_content.split())
    finally:
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except Exception:
                pass
def generate_summary_from_qdrant(doc_id: str, max_chars: int = 80000) -> str:
    """Rebuild a document's text from its stored chunks and summarize it."""
    chunks = get_all_chunks_for_document(doc_id)
    if not chunks:
        return "No content available to summarize."
    # Concatenate up to max_chars
    full_text = ''
    for chunk in chunks:
        if len(full_text) + len(chunk) > max_chars:
            break
        full_text += (chunk + '\n')
    return generate_summary(full_text)
def generate_chat_response(question, context_docs):
    """Generate chat response using Gemini with context"""
    try:
        if not gemini_model:
            return "Chat functionality unavailable - API not configured"
        context = "\n\n".join(context_docs) if context_docs else ""
        prompt = f"""
        You are a research assistant helping users understand academic papers.
        Answer the following question based on the provided context from research papers.
        If the context doesn't contain relevant information, say so politely and suggest what information would be needed.

        Context from research papers:
        {context}

        Question: {question}

        Please provide a clear, accurate, and helpful response.
        """
        response = gemini_model.generate_content(prompt)
        return response.text
    except Exception as e:
        logger.error(f"Chat response error: {e}")
        return "Error generating response. Please try again."

# Routes
def index():
    """Main page"""
    return render_template('index.html')

def search_papers():
    """Search arXiv papers"""
    try:
        data = request.get_json()
        query = data.get('query', '').strip()
        if not query:
            return jsonify({'error': 'Query is required'}), 400
        papers = search_arxiv_papers(query, max_results=10)
        return jsonify({'papers': papers})
    except Exception as e:
        return jsonify({'error': f'Search failed: {str(e)}'}), 500

def ingest_paper_endpoint():
    """Ingest a paper PDF by URL: download, chunk, embed, store in Qdrant."""
    try:
        data = request.get_json()
        pdf_url = data.get('pdf_url') or data.get('url')
        title = data.get('title')
        authors = data.get('authors')
        published = data.get('published')
        if not pdf_url:
            return jsonify({'error': 'pdf_url is required'}), 400
        doc_id, word_count = ingest_paper(pdf_url, paper_meta={'title': title, 'authors': authors, 'published': published})
        if not doc_id:
            return jsonify({'error': 'Failed to ingest paper'}), 500
        return jsonify({'success': True, 'doc_id': doc_id, 'word_count': word_count})
    except Exception as e:
        logger.error(f"Ingestion failed: {e}", exc_info=True)
        return jsonify({'error': f'Ingestion failed: {str(e)}'}), 500

def upload_file():
    """Handle file upload"""
    try:
        if 'file' not in request.files:
            return jsonify({'error': 'No file selected'}), 400
        file = request.files['file']
        if file.filename == '':
            return jsonify({'error': 'No file selected'}), 400
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            # Generate a unique ID for this document session
            doc_id = str(uuid.uuid4())
            # Use a temporary file to avoid cluttering the upload folder
            with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{filename}") as tmp_file:
                file.save(tmp_file.name)
                tmp_file_path = tmp_file.name
            # Extract text from document
            text_content = process_document(tmp_file_path, filename)
            # Clean up temporary file immediately
            os.remove(tmp_file_path)
            if not text_content.strip():
                return jsonify({'error': 'Could not extract text from file'}), 400
            # Generate summary
            summary = generate_summary(text_content)
            # Add to vector database for chat
            metadata = {
                'filename': file.filename,
                'upload_date': datetime.now().isoformat(),
                'type': 'uploaded_document'
            }
            add_document_to_vector_db(text_content, metadata, doc_id)
            # Store the active document ID in the session
            session['active_document_id'] = doc_id
            return jsonify({
                'success': True,
                'filename': file.filename,
                'summary': summary,
                'word_count': len(text_content.split()),
                'doc_id': doc_id  # Send doc_id to frontend
            })
        return jsonify({'error': 'Invalid file type'}), 400
    except Exception as e:
        logger.error(f"Upload failed: {e}", exc_info=True)
        return jsonify({'error': f'Upload failed: {str(e)}'}), 500

def summarize_paper():
    """Summarize paper: if doc_id provided, summarize from Qdrant; else ingest then summarize."""
    try:
        data = request.get_json()
        doc_id = data.get('doc_id')
        paper_url = data.get('url', '').strip()
        pdf_url = data.get('pdf_url')
        if not doc_id and not (paper_url or pdf_url):
            return jsonify({'error': 'doc_id or url/pdf_url is required'}), 400
        # If doc_id not provided, ingest first
        paper_data = None
        if not doc_id:
            # If only an abs URL is provided, try to resolve metadata via the arxiv client
            try:
                # Extract arXiv ID from URL
                arxiv_id = None
                if paper_url:
                    arxiv_id = paper_url.split('/')[-1].replace('.pdf', '')
                if arxiv_id:
                    client = arxiv.Client()
                    search = arxiv.Search(id_list=[arxiv_id])
                    for result in client.results(search):
                        paper_data = {
                            'title': result.title,
                            'authors': [author.name for author in result.authors],
                            'summary': result.summary,
                            'url': result.entry_id,
                            'pdf_url': result.pdf_url,
                            'published': result.published.strftime('%Y-%m-%d')
                        }
                        break
            except Exception:
                paper_data = None
            ingest_pdf = pdf_url or (paper_data['pdf_url'] if paper_data and paper_data.get('pdf_url') else resolve_pdf_url(paper_url))
            new_doc_id, _ = ingest_paper(ingest_pdf, paper_meta=paper_data or {})
            if not new_doc_id:
                return jsonify({'error': 'Failed to ingest paper'}), 500
            doc_id = new_doc_id
            session['active_document_id'] = doc_id
        # Summarize from Qdrant chunks
        summary = generate_summary_from_qdrant(doc_id)
        return jsonify({
            'success': True,
            'summary': summary,
            'doc_id': doc_id,
            'paper': paper_data
        })
    except Exception as e:
        return jsonify({'error': f'Request failed: {str(e)}'}), 500

def chat():
    """Handle chat queries for the active document"""
    try:
        data = request.get_json()
        # Accept both 'message' and 'question' for backward compatibility
        question = data.get('message', data.get('question', '')).strip()
        doc_id = session.get('active_document_id')
        if not question:
            return jsonify({'error': 'Message is required'}), 400
        # If no active document, provide general help
        if not doc_id:
            if not gemini_model:
                return jsonify({'error': 'AI service is not available. Please check your API configuration.'}), 500
            # Generate a general response without document context
            try:
                prompt = f"""
                You are a helpful AI research assistant for Research Radar. The user asked: "{question}"

                Since no document is currently loaded, provide a helpful response about:
                1. How to use Research Radar (search papers, upload documents, chat features)
                2. General research guidance if the question is research-related
                3. Suggest they upload a document or search for papers to get more specific help

                Keep your response friendly and informative.
                """
                response = gemini_model.generate_content(prompt)
                return jsonify({
                    'success': True,
                    'response': response.text,
                    'context_found': False,
                    'no_document': True
                })
            except Exception as e:
                return jsonify({
                    'success': True,
                    'response': "Hello! I'm your AI research assistant. To get started, please upload a document or search for papers using the navigation above. Then I can help you analyze content, answer questions, and provide insights about your research materials.",
                    'context_found': False,
                    'no_document': True
                })
        # Query vector database for relevant context from the active document
        search_results = query_vector_db(question, doc_id)
        context_docs = []
        if search_results and isinstance(search_results, dict) and 'documents' in search_results:
            context_docs = search_results['documents'][0]
        # Generate response
        response = generate_chat_response(question, context_docs)
        return jsonify({
            'success': True,
            'response': response,
            'context_found': len(context_docs) > 0
        })
    except Exception as e:
        return jsonify({'error': f'Chat failed: {str(e)}'}), 500

def get_documents():
    """Get all documents from the vector database"""
    try:
        documents = get_all_documents()
        return jsonify({'success': True, 'documents': documents})
    except Exception as e:
        return jsonify({'error': f'Failed to get documents: {str(e)}'}), 500

def get_document(doc_id):
    """Get a specific document's metadata"""
    try:
        metadata = get_document_metadata(doc_id)
        if not metadata:
            return jsonify({'error': 'Document not found'}), 404
        return jsonify({'success': True, 'document': metadata})
    except Exception as e:
        return jsonify({'error': f'Failed to get document: {str(e)}'}), 500

def get_document_summary(doc_id):
    """Get summary for a specific document"""
    try:
        summary = generate_summary_from_qdrant(doc_id)
        metadata = get_document_metadata(doc_id)
        if not metadata:
            return jsonify({'error': 'Document not found'}), 404
        return jsonify({
            'success': True,
            'summary': summary,
            'document': metadata
        })
    except Exception as e:
        return jsonify({'error': f'Failed to get summary: {str(e)}'}), 500

def activate_document(doc_id):
    """Set a document as the active document for chat"""
    try:
        metadata = get_document_metadata(doc_id)
        if not metadata:
            return jsonify({'error': 'Document not found'}), 404
        session['active_document_id'] = doc_id
        return jsonify({
            'success': True,
            'message': 'Document activated',
            'document': metadata
        })
    except Exception as e:
        return jsonify({'error': f'Failed to activate document: {str(e)}'}), 500

def delete_document(doc_id):
    """Delete a document from Qdrant"""
    try:
        if not qdrant_client:
            return jsonify({'error': 'Vector database not available'}), 500
        # Delete all points for this document
        flt = Filter(must=[FieldCondition(key="document_id", match=MatchValue(value=doc_id))])
        qdrant_client.delete(
            collection_name="research_papers",
            points_selector=FilterSelector(filter=flt)
        )
        return jsonify({
            'success': True,
            'message': 'Document deleted successfully'
        })
    except Exception as e:
        return jsonify({'error': f'Failed to delete document: {str(e)}'}), 500

def clear_all_documents():
    """Clear all documents from Qdrant"""
    try:
        if not qdrant_client:
            return jsonify({'error': 'Vector database not available'}), 500
        # Delete all points by matching an empty filter (matches every point)
        qdrant_client.delete(
            collection_name="research_papers",
            points_selector=FilterSelector(filter=Filter())
        )
        return jsonify({
            'success': True,
            'message': 'All documents cleared successfully'
        })
    except Exception as e:
        return jsonify({'error': f'Failed to clear documents: {str(e)}'}), 500

def clear_session():
    """Clear the active document from the session"""
    session.pop('active_document_id', None)
    return jsonify({'success': True, 'message': 'Session cleared.'})

def health_check():
    """Health check endpoint"""
    return jsonify({
        'status': 'healthy',
        'gemini_available': gemini_model is not None,
        'embeddings_available': embedding_model is not None,
        'vector_db_available': qdrant_client is not None
    })

if __name__ == '__main__':
    print("Research Radar - Starting Flask Application...")
    print("Features: arXiv search, document upload, AI summaries, chat functionality")
    print("Make sure to set GEMINI_API_KEY in your .env file")
    print("Using Qdrant as the vector DB. Ensure Qdrant is reachable via QDRANT_URL")
    # Get port from environment variable (for Hugging Face Spaces)
    port = int(os.environ.get('PORT', 5000))
    debug = os.environ.get('FLASK_ENV') == 'development'
    print(f"Access the app at: http://localhost:{port}")
    app.run(debug=debug, host='0.0.0.0', port=port)