Renangi's picture
code changes as per mentor points and daigram2
5e74947
raw
history blame
4.04 kB
from typing import List, Tuple
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from .config import EMBEDDING_MODEL
class ExampleRetriever:
    """Hybrid BM25 + dense retriever over per-example documents.

    For each RAGBench example we receive a list of documents, where each
    document is a list of (sentence_key, sentence_text) tuples.

    This retriever:
      - Builds a temporary BM25 index over all documents of the example.
      - Encodes each document into a dense embedding with
        SentenceTransformer.
      - Computes a *hybrid* relevance score that is a weighted combination
        of BM25 (lexical) and dense cosine similarity (semantic).
      - Returns indices of the top-k most relevant documents.

    This design matches the RAGBench diagram block:
        Retrieval -> BM25 + LLM-Embedder -> Hybrid Search.
    """

    def __init__(self, alpha: float = 0.5) -> None:
        """Create the retriever.

        Args:
            alpha: Weight for the dense similarity score. The BM25 weight
                is (1 - alpha). alpha=0.5 gives an even hybrid; setting
                alpha=1.0 gives dense-only, alpha=0.0 gives BM25-only.
        """
        self.embedder = SentenceTransformer(EMBEDDING_MODEL)
        self.alpha = alpha

    def _encode(self, texts: List[str]) -> np.ndarray:
        """Encode a list of texts into dense embeddings."""
        return self.embedder.encode(texts, show_progress_bar=False)

    def _prepare_docs(
        self,
        documents_sentences: List[List[Tuple[str, str]]],
    ) -> Tuple[List[str], List[List[str]]]:
        """Convert documents into plain text and tokenized form.

        Args:
            documents_sentences: one list of (sentence_key, sentence_text)
                tuples per document; the keys are ignored here.

        Returns:
            doc_texts: one space-joined string per document.
            tokenized_docs: lower-cased, whitespace-split tokens per
                document, used for BM25.
        """
        doc_texts: List[str] = [
            " ".join(sent for _, sent in doc) for doc in documents_sentences
        ]
        tokenized_docs: List[List[str]] = [
            text.lower().split() for text in doc_texts
        ]
        return doc_texts, tokenized_docs

    @staticmethod
    def _normalize(scores: np.ndarray) -> np.ndarray:
        """Min-max normalize an array of scores to the range [0, 1].

        If all scores are equal, returns a vector of 0.5 to avoid division
        by zero and still allow a meaningful hybrid combination. An empty
        input yields an empty array.
        """
        # Coerce once so plain sequences and integer arrays behave the same.
        scores = np.asarray(scores, dtype=float)
        if scores.size == 0:
            return scores
        s_min = float(scores.min())
        s_max = float(scores.max())
        if s_max - s_min < 1e-12:
            return np.full_like(scores, 0.5, dtype=float)
        return (scores - s_min) / (s_max - s_min)

    @staticmethod
    def _top_k_indices(scores: np.ndarray, k: int) -> List[int]:
        """Return indices of the k highest scores, best first.

        k is clamped to [0, len(scores)]. Tied scores keep their original
        (document) order, so the ranking is deterministic.
        """
        scores = np.asarray(scores, dtype=float)
        k = max(0, min(k, scores.size))
        if k == 0:
            return []
        # Stable sort on the negated scores: descending order where ties
        # favour the lower index, and NaN scores end up last, not first.
        order = np.argsort(-scores, kind="stable")
        return order[:k].tolist()

    def rank_docs(
        self,
        question: str,
        documents_sentences: List[List[Tuple[str, str]]],
        k: int = 4,
    ) -> List[int]:
        """Return indices of the top-k relevant documents for a question.

        This uses a hybrid BM25 + dense approach over *all* documents
        passed for the current example.

        Args:
            question: the query text.
            documents_sentences: one list of (sentence_key, sentence_text)
                tuples per document.
            k: maximum number of document indices to return.

        Returns:
            Up to k indices into documents_sentences, most relevant first.
            Empty when there are no documents or k is not positive.
        """
        # A negative k used to reach the final slice and silently drop
        # documents from the end of the ranking; bail out before doing any
        # indexing or encoding work.
        if k <= 0 or not documents_sentences:
            return []

        # Prepare document representations
        doc_texts, tokenized_docs = self._prepare_docs(documents_sentences)

        # --- BM25 sparse scores ---
        if any(tokenized_docs):
            bm25 = BM25Okapi(tokenized_docs)
            bm25_scores = np.array(
                bm25.get_scores(question.lower().split()), dtype=float
            )
        else:
            # No document contains a single token: BM25Okapi averages the
            # IDF over an empty vocabulary and fails with
            # ZeroDivisionError, so fall back to neutral lexical scores.
            bm25_scores = np.zeros(len(doc_texts), dtype=float)

        # --- Dense cosine similarity scores ---
        q_emb = self._encode([question])
        d_emb = self._encode(doc_texts)
        dense_scores = cosine_similarity(q_emb, d_emb)[0]

        # --- Hybrid score: weighted combination ---
        bm25_norm = self._normalize(bm25_scores)
        dense_norm = self._normalize(dense_scores)
        hybrid_scores = (
            self.alpha * dense_norm + (1.0 - self.alpha) * bm25_norm
        )

        # Pick top-k indices
        return self._top_k_indices(hybrid_scores, k)