import os
import json
import tempfile
from datetime import datetime
from flask import Flask, render_template, request, jsonify, session, redirect, url_for
import google.generativeai as genai
from sentence_transformers import SentenceTransformer
# Removed ChromaDB and added Qdrant
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, Filter, FieldCondition, MatchValue, PointStruct, SearchParams, FilterSelector
# LangChain splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
import arxiv
import PyPDF2
from docx import Document
import requests
from werkzeug.utils import secure_filename
from dotenv import load_dotenv
import uuid
import re
from bs4 import BeautifulSoup
import logging
import numpy as np

# Load environment variables
load_dotenv()
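# For reference, the environment variables read in this module (via os.getenv /
# os.environ below) are: SECRET_KEY, GEMINI_API_KEY, QDRANT_URL, QDRANT_API_KEY,
# PORT and FLASK_ENV. Embedding-model settings (LOCAL_MODEL_PATH, EMBEDDING_MODEL)
# come from config.Config rather than the environment.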
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__)
app.secret_key = os.getenv('SECRET_KEY', 'research-radar-secret-key-2024')

# Configuration
UPLOAD_FOLDER = 'uploads'
ALLOWED_EXTENSIONS = {'txt', 'pdf', 'docx'}
MAX_CONTENT_LENGTH = 16 * 1024 * 1024  # 16MB max file size
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['MAX_CONTENT_LENGTH'] = MAX_CONTENT_LENGTH

# Ensure directories exist
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Initialize models and services
try:
    # Configure Gemini API
    gemini_api_key = os.getenv('GEMINI_API_KEY')
    if gemini_api_key:
        genai.configure(api_key=gemini_api_key)
        gemini_model = genai.GenerativeModel('gemini-2.5-flash')
        logger.info("Gemini API initialized successfully")
    else:
        gemini_model = None
        logger.warning("Gemini API key not found. AI features will be limited.")

    # Initialize sentence transformer for embeddings (local model)
    from config import Config
    local_model_path = Config.LOCAL_MODEL_PATH
    if os.path.exists(local_model_path):
        embedding_model = SentenceTransformer(local_model_path)
        logger.info(f"Local sentence transformer model loaded from: {local_model_path}")
    else:
        # Fall back to downloading if the local model is not found
        embedding_model = SentenceTransformer(Config.EMBEDDING_MODEL)
        logger.warning(f"Local model not found at {local_model_path}, downloading {Config.EMBEDDING_MODEL} from HuggingFace")

    # Determine vector size dynamically
    try:
        _probe_vec = embedding_model.encode(["probe text"])
        VECTOR_SIZE = int(_probe_vec.shape[-1]) if hasattr(_probe_vec, 'shape') else len(_probe_vec[0])
    except Exception:
        VECTOR_SIZE = 384  # fallback for all-MiniLM-L6-v2

    # Initialize Qdrant client
    qdrant_url = os.getenv('QDRANT_URL')
    qdrant_api_key = os.getenv('QDRANT_API_KEY')
    qdrant_client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key, timeout=120)
    logger.info("Qdrant client initialized")

    # Ensure default collection exists
    def ensure_qdrant_collection(collection_name: str, vector_size: int) -> None:
        try:
            qdrant_client.get_collection(collection_name)
        except Exception:
            qdrant_client.recreate_collection(
                collection_name=collection_name,
                vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
            )
            logger.info(f"Created Qdrant collection: {collection_name}")
        # Ensure payload index for document_id exists
        try:
            qdrant_client.create_payload_index(
                collection_name=collection_name,
                field_name="document_id",
                field_schema="keyword"
            )
            logger.info("Ensured payload index for 'document_id'")
        except Exception:
            # Likely already exists
            pass

    ensure_qdrant_collection('research_papers', VECTOR_SIZE)
except Exception as e:
    logger.error(f"Initialization error: {e}")
    embedding_model = None
    gemini_model = None
    qdrant_client = None
    VECTOR_SIZE = None

def allowed_file(filename):
    """Check if file extension is allowed"""
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

def extract_text_from_pdf(file_path):
    """Extract text from PDF file"""
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            text = ""
            for page in pdf_reader.pages:
                text += page.extract_text() + "\n"
            return text
    except Exception as e:
        print(f"PDF extraction error: {e}")
        return ""

def extract_text_from_docx(file_path):
    """Extract text from DOCX file"""
    try:
        doc = Document(file_path)
        text = ""
        for paragraph in doc.paragraphs:
            text += paragraph.text + "\n"
        return text
    except Exception as e:
        print(f"DOCX extraction error: {e}")
        return ""

def extract_text_from_txt(file_path):
    """Extract text from TXT file"""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except Exception as e:
        print(f"TXT extraction error: {e}")
        return ""

def process_document(file_path, filename):
    """Process uploaded document and extract text"""
    file_extension = filename.rsplit('.', 1)[1].lower()
    if file_extension == 'pdf':
        return extract_text_from_pdf(file_path)
    elif file_extension == 'docx':
        return extract_text_from_docx(file_path)
    elif file_extension == 'txt':
        return extract_text_from_txt(file_path)
    else:
        return ""

def search_arxiv_papers(query, max_results=10):
    """Search arXiv papers"""
    try:
        client = arxiv.Client()
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.Relevance
        )
        papers = []
        for result in client.results(search):
            paper = {
                'title': result.title,
                'authors': [author.name for author in result.authors],
                'summary': result.summary,
                'url': result.entry_id,
                'pdf_url': result.pdf_url,
                'published': result.published.strftime('%Y-%m-%d'),
                'category': result.primary_category
            }
            papers.append(paper)
        return papers
    except Exception as e:
        print(f"arXiv search error: {e}")
        return []

def generate_summary(text, max_length=500):
    """Generate summary using Gemini API"""
    try:
        if not gemini_model:
            return "Summary generation unavailable - API not configured"
        prompt = f"""
        Please provide a comprehensive summary of this research paper/document in approximately {max_length} words.
        Focus on:
        1. Main research question/objective
        2. Key methodology
        3. Important findings
        4. Conclusions and implications

        Text to summarize:
        {text[:80000]}
        """
        response = gemini_model.generate_content(prompt)
        return response.text
    except Exception as e:
        logger.error(f"Summary generation error: {e}")
        return "Error generating summary. Please try again."

# Text chunking using LangChain
def chunk_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""]
    )
    return splitter.split_text(text)
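# Illustrative note on chunk_text (informal, not from the original source): with
# the separators above the splitter prefers paragraph breaks ("\n\n"), then line
# breaks, then spaces, and packs the resulting pieces into chunks of at most
# ~chunk_size characters with roughly chunk_overlap characters of overlap.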
# Qdrant helpers
def ensure_qdrant_collection(collection_name: str, vector_size: int) -> None:
    """Create Qdrant collection if it doesn't exist"""
    if not qdrant_client:
        return
    try:
        qdrant_client.get_collection(collection_name)
    except Exception:
        qdrant_client.recreate_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
        )
    # Ensure payload index for document_id exists for efficient filtering/scrolling
    try:
        qdrant_client.create_payload_index(
            collection_name=collection_name,
            field_name="document_id",
            field_schema="keyword"
        )
    except Exception:
        pass

def add_document_to_vector_db(text, metadata, doc_id, collection_name="research_papers"):
    """Add chunked document vectors to Qdrant for chat functionality"""
    try:
        if not embedding_model or not qdrant_client or not VECTOR_SIZE:
            return False
        ensure_qdrant_collection(collection_name, VECTOR_SIZE)
        # Split text using recursive text splitter
        chunks = chunk_text(text, chunk_size=1200, chunk_overlap=250)
        if not chunks:
            return False
        embeddings = embedding_model.encode(chunks)
        vectors = embeddings.tolist() if hasattr(embeddings, 'tolist') else embeddings
        points = []
        for i, (chunk, vector) in enumerate(zip(chunks, vectors)):
            payload = dict(metadata or {})
            payload.update({
                'document_id': doc_id,
                'chunk_index': i,
                'total_chunks': len(chunks),
                'content': chunk,
            })
            points.append(
                PointStruct(
                    id=str(uuid.uuid4()),
                    vector=vector,
                    payload=payload
                )
            )
        qdrant_client.upsert(collection_name=collection_name, points=points, wait=True)
        return True
    except Exception as e:
        print(f"Vector DB error: {e}")
        return False
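# For reference, every point stored above carries the payload keys document_id,
# chunk_index, total_chunks and content, plus whatever metadata the caller
# supplied (e.g. filename/upload_date/type for uploads, or title/authors/
# pdf_url/published for arXiv papers); the retrieval helpers below read these keys.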
def query_vector_db(query, doc_id, collection_name="research_papers", n_results=3):
    """Query Qdrant for similar chunks for the given document_id"""
    try:
        if not embedding_model or not qdrant_client or not VECTOR_SIZE:
            return []
        ensure_qdrant_collection(collection_name, VECTOR_SIZE)
        query_embedding = embedding_model.encode([query])
        query_vector = query_embedding[0].tolist() if hasattr(query_embedding, 'tolist') else list(query_embedding[0])
        flt = Filter(must=[FieldCondition(key="document_id", match=MatchValue(value=doc_id))])
        results = qdrant_client.search(
            collection_name=collection_name,
            query_vector=query_vector,
            limit=n_results,
            query_filter=flt,
            with_payload=True,
            with_vectors=False
        )
        documents = []
        for r in results or []:
            payload = getattr(r, 'payload', None) or {}
            documents.append(payload.get('content', ''))
        return {'documents': [documents]}
    except Exception as e:
        print(f"Vector DB query error: {e}")
        return []
def get_all_chunks_for_document(doc_id: str, collection_name: str = "research_papers"):
    """Retrieve all chunks for a document from Qdrant, ordered by chunk_index"""
    try:
        all_points = []
        next_offset = None
        flt = Filter(must=[FieldCondition(key="document_id", match=MatchValue(value=doc_id))])
        while True:
            points, next_offset = qdrant_client.scroll(
                collection_name=collection_name,
                scroll_filter=flt,
                limit=500,
                offset=next_offset,
                with_payload=True,
                with_vectors=False
            )
            all_points.extend(points)
            if not next_offset:
                break
        # Order by chunk_index
        all_points.sort(key=lambda p: p.payload.get('chunk_index', 0))
        return [p.payload.get('content', '') for p in all_points]
    except Exception as e:
        print(f"Qdrant scroll error: {e}")
        return []
def get_all_documents(collection_name: str = "research_papers"):
    """Get all unique documents from Qdrant with their metadata"""
    try:
        if not qdrant_client:
            return []
        # Get all points to extract unique documents
        all_points = []
        next_offset = None
        while True:
            points, next_offset = qdrant_client.scroll(
                collection_name=collection_name,
                limit=1000,
                offset=next_offset,
                with_payload=True,
                with_vectors=False
            )
            all_points.extend(points)
            if not next_offset:
                break
        # Group by document_id and extract metadata
        documents = {}
        for point in all_points:
            payload = point.payload or {}
            doc_id = payload.get('document_id')
            if not doc_id:
                continue
            if doc_id not in documents:
                # Create document metadata from the first chunk seen
                doc_type = payload.get('type', 'document')
                # Generate a proper title based on type
                title = payload.get('title', 'Untitled Document')
                if doc_type == 'arxiv_paper' and payload.get('pdf_url'):
                    # Extract arXiv ID from URL for a better title
                    pdf_url = payload.get('pdf_url', '')
                    if 'arxiv.org/pdf/' in pdf_url:
                        arxiv_id = pdf_url.split('/')[-1].replace('.pdf', '')
                        title = f"arXiv:{arxiv_id}"
                    elif 'arxiv.org/abs/' in pdf_url:
                        arxiv_id = pdf_url.split('/')[-1]
                        title = f"arXiv:{arxiv_id}"
                elif doc_type == 'uploaded_document' and payload.get('filename'):
                    title = payload.get('filename')
                documents[doc_id] = {
                    'document_id': doc_id,
                    'title': title,
                    'authors': payload.get('authors', ['Unknown']),
                    'published': payload.get('published', 'Unknown Date'),
                    'category': payload.get('category', 'Research'),
                    'filename': payload.get('filename', ''),
                    'pdf_url': payload.get('pdf_url', ''),
                    'type': doc_type,
                    'upload_date': payload.get('upload_date', ''),
                    'total_chunks': payload.get('total_chunks', 0),
                    'word_count': payload.get('word_count', 0)
                }
        # Convert to list and sort by upload date (newest first)
        doc_list = list(documents.values())
        doc_list.sort(key=lambda x: x.get('upload_date', ''), reverse=True)
        return doc_list
    except Exception as e:
        print(f"Error getting documents: {e}")
        return []
def get_document_metadata(doc_id: str, collection_name: str = "research_papers"):
    """Get metadata for a specific document"""
    try:
        if not qdrant_client:
            return None
        # Get first chunk to extract metadata
        flt = Filter(must=[FieldCondition(key="document_id", match=MatchValue(value=doc_id))])
        results = qdrant_client.scroll(
            collection_name=collection_name,
            scroll_filter=flt,
            limit=1,
            with_payload=True,
            with_vectors=False
        )
        if results and results[0]:
            payload = results[0][0].payload or {}
            return {
                'document_id': doc_id,
                'title': payload.get('title', 'Untitled Document'),
                'authors': payload.get('authors', ['Unknown']),
                'published': payload.get('published', 'Unknown Date'),
                'category': payload.get('category', 'Research'),
                'filename': payload.get('filename', ''),
                'pdf_url': payload.get('pdf_url', ''),
                'type': payload.get('type', 'document'),
                'upload_date': payload.get('upload_date', ''),
                'total_chunks': payload.get('total_chunks', 0),
                'word_count': payload.get('word_count', 0)
            }
        return None
    except Exception as e:
        print(f"Error getting document metadata: {e}")
        return None
# Paper ingestion helpers
def resolve_pdf_url(url_or_pdf: str) -> str:
    """Normalize an arXiv abs/pdf URL to a direct PDF URL; pass other URLs through."""
    if not url_or_pdf:
        return ''
    if 'arxiv.org/pdf/' in url_or_pdf and url_or_pdf.endswith('.pdf'):
        return url_or_pdf
    # Convert an arXiv abs URL to a PDF URL
    m = re.search(r"arxiv\.org/(abs|pdf)/([\w\.-]+)", url_or_pdf)
    if m:
        arxiv_id = m.group(2)
        if not arxiv_id.endswith('.pdf'):
            return f"https://arxiv.org/pdf/{arxiv_id}.pdf"
        return f"https://arxiv.org/pdf/{arxiv_id}"
    return url_or_pdf
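# Illustrative examples of resolve_pdf_url (hypothetical inputs, shown for clarity):
#   "https://arxiv.org/abs/1234.56789"     -> "https://arxiv.org/pdf/1234.56789.pdf"
#   "https://arxiv.org/pdf/1234.56789.pdf" -> returned unchanged
#   any non-arXiv URL                      -> returned unchanged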
def download_pdf_to_temp(pdf_url: str) -> str:
    """Download a PDF to a temporary file and return the file path."""
    r = requests.get(pdf_url, stream=True, timeout=30)
    r.raise_for_status()
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
        for chunk in r.iter_content(chunk_size=8192):
            if chunk:
                tmp.write(chunk)
    return tmp.name
def ingest_paper(pdf_url: str, paper_meta: dict = None) -> tuple:
    """Download PDF, extract text, chunk, embed and store in Qdrant. Returns (doc_id, word_count)."""
    pdf_url = resolve_pdf_url(pdf_url)
    doc_id = str(uuid.uuid4())
    tmp_path = None
    try:
        tmp_path = download_pdf_to_temp(pdf_url)
        text_content = extract_text_from_pdf(tmp_path)
        if not text_content.strip():
            return None, 0
        metadata = {
            'source': 'arxiv',
            'pdf_url': pdf_url,
            'type': 'arxiv_paper'
        }
        if paper_meta:
            metadata.update(paper_meta)
        ok = add_document_to_vector_db(text_content, metadata, doc_id)
        if not ok:
            return None, 0
        # Set active document
        session['active_document_id'] = doc_id
        return doc_id, len(text_content.split())
    finally:
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except Exception:
                pass
def generate_summary_from_qdrant(doc_id: str, max_chars: int = 80000) -> str:
    """Rebuild a document's text from its stored chunks and summarize it."""
    chunks = get_all_chunks_for_document(doc_id)
    if not chunks:
        return "No content available to summarize."
    # Concatenate up to max_chars
    full_text = ''
    for chunk in chunks:
        if len(full_text) + len(chunk) > max_chars:
            break
        full_text += (chunk + '\n')
    return generate_summary(full_text)
def generate_chat_response(question, context_docs):
    """Generate chat response using Gemini with context"""
    try:
        if not gemini_model:
            return "Chat functionality unavailable - API not configured"
        context = "\n\n".join(context_docs) if context_docs else ""
        prompt = f"""
        You are a research assistant helping users understand academic papers.
        Answer the following question based on the provided context from research papers.
        If the context doesn't contain relevant information, say so politely and suggest what information would be needed.

        Context from research papers:
        {context}

        Question: {question}

        Please provide a clear, accurate, and helpful response.
        """
        response = gemini_model.generate_content(prompt)
        return response.text
    except Exception as e:
        logger.error(f"Chat response error: {e}")
        return "Error generating response. Please try again."

# Routes
def index():
    """Main page"""
    return render_template('index.html')

def search_papers():
    """Search arXiv papers"""
    try:
        data = request.get_json()
        query = data.get('query', '').strip()
        if not query:
            return jsonify({'error': 'Query is required'}), 400
        papers = search_arxiv_papers(query, max_results=10)
        return jsonify({'papers': papers})
    except Exception as e:
        return jsonify({'error': f'Search failed: {str(e)}'}), 500

def ingest_paper_endpoint():
    """Ingest a paper PDF by URL: download, chunk, embed, store in Qdrant."""
    try:
        data = request.get_json()
        pdf_url = data.get('pdf_url') or data.get('url')
        title = data.get('title')
        authors = data.get('authors')
        published = data.get('published')
        if not pdf_url:
            return jsonify({'error': 'pdf_url is required'}), 400
        doc_id, word_count = ingest_paper(pdf_url, paper_meta={'title': title, 'authors': authors, 'published': published})
        if not doc_id:
            return jsonify({'error': 'Failed to ingest paper'}), 500
        return jsonify({'success': True, 'doc_id': doc_id, 'word_count': word_count})
    except Exception as e:
        logger.error(f"Ingestion failed: {e}", exc_info=True)
        return jsonify({'error': f'Ingestion failed: {str(e)}'}), 500

def upload_file():
    """Handle file upload"""
    try:
        if 'file' not in request.files:
            return jsonify({'error': 'No file selected'}), 400
        file = request.files['file']
        if file.filename == '':
            return jsonify({'error': 'No file selected'}), 400
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            # Generate a unique ID for this document session
            doc_id = str(uuid.uuid4())
            # Use a temporary file to avoid cluttering the upload folder
            with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{filename}") as tmp_file:
                file.save(tmp_file.name)
                tmp_file_path = tmp_file.name
            # Extract text from document
            text_content = process_document(tmp_file_path, filename)
            # Clean up temporary file immediately
            os.remove(tmp_file_path)
            if not text_content.strip():
                return jsonify({'error': 'Could not extract text from file'}), 400
            # Generate summary
            summary = generate_summary(text_content)
            # Add to vector database for chat
            metadata = {
                'filename': file.filename,
                'upload_date': datetime.now().isoformat(),
                'type': 'uploaded_document'
            }
            add_document_to_vector_db(text_content, metadata, doc_id)
            # Store the active document ID in the session
            session['active_document_id'] = doc_id
            return jsonify({
                'success': True,
                'filename': file.filename,
                'summary': summary,
                'word_count': len(text_content.split()),
                'doc_id': doc_id  # Send doc_id to frontend
            })
        return jsonify({'error': 'Invalid file type'}), 400
    except Exception as e:
        logger.error(f"Upload failed: {e}", exc_info=True)
        return jsonify({'error': f'Upload failed: {str(e)}'}), 500

def summarize_paper():
    """Summarize paper: if doc_id provided, summarize from Qdrant; else ingest then summarize."""
    try:
        data = request.get_json()
        doc_id = data.get('doc_id')
        paper_url = data.get('url', '').strip()
        pdf_url = data.get('pdf_url')
        if not doc_id and not (paper_url or pdf_url):
            return jsonify({'error': 'doc_id or url/pdf_url is required'}), 400
        # If doc_id not provided, ingest first
        paper_data = None
        if not doc_id:
            # If only an abs URL is provided, try to resolve metadata via the arxiv client
            try:
                # Extract arXiv ID from URL
                arxiv_id = None
                if paper_url:
                    arxiv_id = paper_url.split('/')[-1].replace('.pdf', '')
                if arxiv_id:
                    client = arxiv.Client()
                    search = arxiv.Search(id_list=[arxiv_id])
                    for result in client.results(search):
                        paper_data = {
                            'title': result.title,
                            'authors': [author.name for author in result.authors],
                            'summary': result.summary,
                            'url': result.entry_id,
                            'pdf_url': result.pdf_url,
                            'published': result.published.strftime('%Y-%m-%d')
                        }
                        break
            except Exception:
                paper_data = None
            ingest_pdf = pdf_url or (paper_data['pdf_url'] if paper_data and paper_data.get('pdf_url') else resolve_pdf_url(paper_url))
            new_doc_id, _ = ingest_paper(ingest_pdf, paper_meta=paper_data or {})
            if not new_doc_id:
                return jsonify({'error': 'Failed to ingest paper'}), 500
            doc_id = new_doc_id
            session['active_document_id'] = doc_id
        # Summarize from Qdrant chunks
        summary = generate_summary_from_qdrant(doc_id)
        return jsonify({
            'success': True,
            'summary': summary,
            'doc_id': doc_id,
            'paper': paper_data
        })
    except Exception as e:
        return jsonify({'error': f'Request failed: {str(e)}'}), 500

def chat():
    """Handle chat queries for the active document"""
    try:
        data = request.get_json()
        # Accept both 'message' and 'question' for backward compatibility
        question = data.get('message', data.get('question', '')).strip()
        doc_id = session.get('active_document_id')
        if not question:
            return jsonify({'error': 'Message is required'}), 400
        # If no active document, provide general help
        if not doc_id:
            if not gemini_model:
                return jsonify({'error': 'AI service is not available. Please check your API configuration.'}), 500
            # Generate a general response without document context
            try:
                prompt = f"""
                You are a helpful AI research assistant for Research Radar. The user asked: "{question}"

                Since no document is currently loaded, provide a helpful response about:
                1. How to use Research Radar (search papers, upload documents, chat features)
                2. General research guidance if the question is research-related
                3. Suggest they upload a document or search for papers to get more specific help

                Keep your response friendly and informative.
                """
                response = gemini_model.generate_content(prompt)
                return jsonify({
                    'success': True,
                    'response': response.text,
                    'context_found': False,
                    'no_document': True
                })
            except Exception as e:
                return jsonify({
                    'success': True,
                    'response': "Hello! I'm your AI research assistant. To get started, please upload a document or search for papers using the navigation above. Then I can help you analyze content, answer questions, and provide insights about your research materials.",
                    'context_found': False,
                    'no_document': True
                })
        # Query vector database for relevant context from the active document
        search_results = query_vector_db(question, doc_id)
        context_docs = []
        if search_results and isinstance(search_results, dict) and 'documents' in search_results:
            context_docs = search_results['documents'][0]
        # Generate response
        response = generate_chat_response(question, context_docs)
        return jsonify({
            'success': True,
            'response': response,
            'context_found': len(context_docs) > 0
        })
    except Exception as e:
        return jsonify({'error': f'Chat failed: {str(e)}'}), 500

def get_documents():
    """Get all documents from the vector database"""
    try:
        documents = get_all_documents()
        return jsonify({'success': True, 'documents': documents})
    except Exception as e:
        return jsonify({'error': f'Failed to get documents: {str(e)}'}), 500

def get_document(doc_id):
    """Get a specific document's metadata"""
    try:
        metadata = get_document_metadata(doc_id)
        if not metadata:
            return jsonify({'error': 'Document not found'}), 404
        return jsonify({'success': True, 'document': metadata})
    except Exception as e:
        return jsonify({'error': f'Failed to get document: {str(e)}'}), 500

def get_document_summary(doc_id):
    """Get summary for a specific document"""
    try:
        summary = generate_summary_from_qdrant(doc_id)
        metadata = get_document_metadata(doc_id)
        if not metadata:
            return jsonify({'error': 'Document not found'}), 404
        return jsonify({
            'success': True,
            'summary': summary,
            'document': metadata
        })
    except Exception as e:
        return jsonify({'error': f'Failed to get summary: {str(e)}'}), 500

def activate_document(doc_id):
    """Set a document as the active document for chat"""
    try:
        metadata = get_document_metadata(doc_id)
        if not metadata:
            return jsonify({'error': 'Document not found'}), 404
        session['active_document_id'] = doc_id
        return jsonify({
            'success': True,
            'message': 'Document activated',
            'document': metadata
        })
    except Exception as e:
        return jsonify({'error': f'Failed to activate document: {str(e)}'}), 500

def delete_document(doc_id):
    """Delete a document from Qdrant"""
    try:
        if not qdrant_client:
            return jsonify({'error': 'Vector database not available'}), 500
        # Delete all points for this document
        flt = Filter(must=[FieldCondition(key="document_id", match=MatchValue(value=doc_id))])
        qdrant_client.delete(
            collection_name="research_papers",
            points_selector=FilterSelector(filter=flt)
        )
        return jsonify({
            'success': True,
            'message': 'Document deleted successfully'
        })
    except Exception as e:
        return jsonify({'error': f'Failed to delete document: {str(e)}'}), 500

def clear_all_documents():
    """Clear all documents from Qdrant"""
    try:
        if not qdrant_client:
            return jsonify({'error': 'Vector database not available'}), 500
        # Delete all points by matching an empty filter (matches every point)
        qdrant_client.delete(
            collection_name="research_papers",
            points_selector=FilterSelector(filter=Filter())
        )
        return jsonify({
            'success': True,
            'message': 'All documents cleared successfully'
        })
    except Exception as e:
        return jsonify({'error': f'Failed to clear documents: {str(e)}'}), 500

def clear_session():
    """Clear the active document from the session"""
    session.pop('active_document_id', None)
    return jsonify({'success': True, 'message': 'Session cleared.'})

def health_check():
    """Health check endpoint"""
    return jsonify({
        'status': 'healthy',
        'gemini_available': gemini_model is not None,
        'embeddings_available': embedding_model is not None,
        'vector_db_available': qdrant_client is not None
    })

if __name__ == '__main__':
    print("Research Radar - Starting Flask Application...")
    print("Features: arXiv search, document upload, AI summaries, chat functionality")
    print("Make sure to set GEMINI_API_KEY in your .env file")
    print("Using Qdrant as the vector DB. Ensure Qdrant is reachable via QDRANT_URL")
    # Get port from environment variable (for Hugging Face Spaces)
    port = int(os.environ.get('PORT', 5000))
    debug = os.environ.get('FLASK_ENV') == 'development'
    print(f"Access the app at: http://localhost:{port}")
    app.run(debug=debug, host='0.0.0.0', port=port)