Spaces:

Renangi
/

ragbench-rag-eval

Running

App Files Files Community

ragbench-rag-eval / ragbench_eval /retriever.py

Renangi

code changes as per mentor points and diagram2

5e74947 24 days ago

raw

history blame

4.04 kB

	from typing import List, Tuple
	import numpy as np
	from sentence_transformers import SentenceTransformer
	from sklearn.metrics.pairwise import cosine_similarity
	from rank_bm25 import BM25Okapi

	from .config import EMBEDDING_MODEL

	class ExampleRetriever:
	    """Hybrid BM25 + dense retriever over per-example documents.

	    Each example supplies a list of documents, and each document is a list
	    of ``(sentence_key, sentence_text)`` tuples. For one question the
	    retriever:

	    - builds a throwaway BM25 index over the example's documents,
	    - embeds the question and every document with a SentenceTransformer,
	    - min-max normalizes both score vectors and mixes them with ``alpha``,
	    - returns the indices of the ``k`` best-scoring documents.

	    This corresponds to the RAGBench diagram block:
	    Retrieval -> BM25 + LLM-Embedder -> Hybrid Search.
	    """

	    def __init__(self, alpha: float = 0.5) -> None:
	        """Create the retriever and load the embedding model.

	        Args:
	            alpha: Weight of the dense similarity score; BM25 gets
	                ``1 - alpha``. ``0.5`` is an even mix, ``1.0`` is
	                dense-only and ``0.0`` is BM25-only.
	        """
	        self.embedder = SentenceTransformer(EMBEDDING_MODEL)
	        self.alpha = alpha

	    def _encode(self, texts: List[str]) -> np.ndarray:
	        """Encode ``texts`` with the embedder, progress bar disabled."""
	        return self.embedder.encode(texts, show_progress_bar=False)

	    def _prepare_docs(
	        self,
	        documents_sentences: List[List[Tuple[str, str]]],
	    ) -> Tuple[List[str], List[List[str]]]:
	        """Flatten documents to plain text and to BM25 tokens.

	        Args:
	            documents_sentences: One list of ``(key, text)`` tuples per
	                document; the keys are ignored here.

	        Returns:
	            A pair ``(doc_texts, tokenized_docs)``: the sentences of each
	            document joined with single spaces, and the same text
	            lower-cased and split on whitespace.
	        """
	        doc_texts: List[str] = [
	            " ".join(sentence for _, sentence in doc)
	            for doc in documents_sentences
	        ]
	        tokenized_docs: List[List[str]] = [
	            text.lower().split() for text in doc_texts
	        ]
	        return doc_texts, tokenized_docs

	    @staticmethod
	    def _normalize(scores: np.ndarray) -> np.ndarray:
	        """Min-max scale ``scores`` into the range [0, 1].

	        An empty input is returned unchanged. If all (finite) scores are
	        equal, or none are finite, a vector of 0.5 is returned so that the
	        division by zero is avoided and the hybrid mix stays meaningful.

	        Non-finite entries are clamped to the finite range before scaling
	        (NaN and -inf to the minimum, +inf to the maximum). Without this a
	        single NaN would turn every normalized score into NaN.
	        """
	        values = np.asarray(scores)
	        if values.size == 0:
	            return scores

	        finite = np.isfinite(values)
	        if not finite.any():
	            return np.full_like(values, 0.5, dtype=float)

	        lo = float(values[finite].min())
	        hi = float(values[finite].max())
	        if hi - lo < 1e-12:
	            return np.full_like(values, 0.5, dtype=float)

	        # Only touch the data when needed, so all-finite input follows
	        # exactly the same arithmetic as a plain min-max scaling.
	        if not finite.all():
	            values = np.nan_to_num(values, nan=lo, posinf=hi, neginf=lo)
	        return (values - lo) / (hi - lo)

	    @staticmethod
	    def _top_k(scores: np.ndarray, k: int) -> List[int]:
	        """Return indices of the ``k`` highest scores, best first.

	        A stable sort on the negated scores is used, so documents with
	        equal scores keep their original relative order. ``k <= 0``
	        yields an empty list; ``k`` larger than the number of scores
	        yields all indices.
	        """
	        if k <= 0 or scores.size == 0:
	            return []
	        order = np.argsort(-scores, kind="stable")
	        return order[:k].tolist()

	    def rank_docs(
	        self,
	        question: str,
	        documents_sentences: List[List[Tuple[str, str]]],
	        k: int = 4,
	    ) -> List[int]:
	        """Return indices of the top-k relevant documents for a question.

	        Args:
	            question: The query text.
	            documents_sentences: Documents of the current example, each a
	                list of ``(sentence_key, sentence_text)`` tuples.
	            k: Maximum number of indices to return.

	        Returns:
	            Indices into ``documents_sentences`` ordered from most to
	            least relevant by hybrid score; ties keep document order.
	            Empty when there are no documents or ``k <= 0``.
	        """
	        # Guard first: a negative k would otherwise slice "all but the
	        # last |k|" results, and no model work is needed for an empty
	        # answer.
	        if not documents_sentences or k <= 0:
	            return []

	        doc_texts, tokenized_docs = self._prepare_docs(documents_sentences)

	        # --- BM25 sparse scores ---
	        bm25 = BM25Okapi(tokenized_docs)
	        bm25_scores = np.array(
	            bm25.get_scores(question.lower().split()), dtype=float
	        )

	        # --- Dense cosine similarity scores ---
	        q_emb = self._encode([question])
	        d_emb = self._encode(doc_texts)
	        dense_scores = cosine_similarity(q_emb, d_emb)[0]

	        # --- Hybrid score: weighted combination of normalized scores ---
	        bm25_norm = self._normalize(bm25_scores)
	        dense_norm = self._normalize(dense_scores)
	        hybrid_scores = (
	            self.alpha * dense_norm + (1.0 - self.alpha) * bm25_norm
	        )

	        return self._top_k(hybrid_scores, k)