from typing import List, Tuple

import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from .config import EMBEDDING_MODEL


class ExampleRetriever:
    """Hybrid BM25 + dense retriever over per-example documents.

    For each RAGBench example we receive a list of documents, where each
    document is a list of (sentence_key, sentence_text) tuples. This
    retriever:

    - Builds a temporary BM25 index over all documents of the example.
    - Encodes each document into a dense embedding with SentenceTransformer.
    - Computes a *hybrid* relevance score that is a weighted combination of
      BM25 (lexical) and dense cosine similarity (semantic).
    - Returns indices of the top-k most relevant documents.

    This design matches the RAGBench diagram block:
    Retrieval -> BM25 + LLM-Embedder -> Hybrid Search.
    """  # noqa: E501

    def __init__(self, alpha: float = 0.5) -> None:
        """Create the retriever.

        Args:
            alpha: Weight for the dense similarity score. The BM25 weight is
                (1 - alpha). alpha=0.5 gives an even hybrid; setting
                alpha=1.0 gives dense-only, alpha=0.0 gives BM25-only.
        """
        self.embedder = SentenceTransformer(EMBEDDING_MODEL)
        self.alpha = alpha

    def _encode(self, texts: List[str]) -> np.ndarray:
        """Encode a list of texts into dense embeddings."""
        return self.embedder.encode(texts, show_progress_bar=False)

    def _prepare_docs(
        self,
        documents_sentences: List[List[Tuple[str, str]]],
    ) -> Tuple[List[str], List[List[str]]]:
        """Convert documents into plain text and tokenized form.

        Args:
            documents_sentences: One list of (sentence_key, sentence_text)
                tuples per document; only the sentence text is used.

        Returns:
            doc_texts: one concatenated string per document.
            tokenized_docs: tokenized version used for BM25 (lower-cased,
                split on whitespace).
        """
        doc_texts: List[str] = [
            " ".join(sent for _, sent in doc) for doc in documents_sentences
        ]
        tokenized_docs: List[List[str]] = [
            text.lower().split() for text in doc_texts
        ]
        return doc_texts, tokenized_docs

    @staticmethod
    def _normalize(scores: np.ndarray) -> np.ndarray:
        """Normalize an array of scores to the range [0, 1].

        If all scores are equal, returns a vector of 0.5 to avoid division
        by zero and still allow a meaningful hybrid combination. An empty
        array is returned unchanged.
        """
        if scores.size == 0:
            return scores
        s_min = float(scores.min())
        s_max = float(scores.max())
        if s_max - s_min < 1e-12:
            return np.full_like(scores, 0.5, dtype=float)
        return (scores - s_min) / (s_max - s_min)

    @staticmethod
    def _bm25_scores(
        query_tokens: List[str],
        tokenized_docs: List[List[str]],
    ) -> np.ndarray:
        """Return one BM25 score per document for the tokenized query."""
        # rank_bm25 averages the IDF over the vocabulary while building the
        # index; when no document contains a single token that average
        # divides by zero. Treat such a corpus as "no lexical signal".
        if not any(tokenized_docs):
            return np.zeros(len(tokenized_docs), dtype=float)
        bm25 = BM25Okapi(tokenized_docs)
        return np.array(bm25.get_scores(query_tokens), dtype=float)

    def _dense_scores(self, question: str, doc_texts: List[str]) -> np.ndarray:
        """Return the cosine similarity of the question to each document."""
        q_emb = self._encode([question])
        d_emb = self._encode(doc_texts)
        # cosine_similarity yields a (1, n_docs) matrix; keep its only row.
        return cosine_similarity(q_emb, d_emb)[0]

    def rank_docs(
        self,
        question: str,
        documents_sentences: List[List[Tuple[str, str]]],
        k: int = 4,
    ) -> List[int]:
        """Return indices of the top-k relevant documents for a question.

        This uses a hybrid BM25 + dense approach over *all* documents passed
        for the current example.

        Args:
            question: The query text.
            documents_sentences: One list of (sentence_key, sentence_text)
                tuples per document.
            k: Maximum number of indices to return. Values larger than the
                number of documents are capped; values <= 0 yield an empty
                list.

        Returns:
            Document indices ordered from highest to lowest hybrid score.
            Documents with equal scores keep their input order.
        """
        if not documents_sentences:
            return []

        # Prepare document representations
        doc_texts, tokenized_docs = self._prepare_docs(documents_sentences)

        # --- BM25 sparse scores ---
        bm25_scores = self._bm25_scores(question.lower().split(), tokenized_docs)

        # --- Dense cosine similarity scores ---
        dense_scores = self._dense_scores(question, doc_texts)

        # --- Hybrid score: weighted combination ---
        # Both signals are min-max scaled first so that alpha weighs them on
        # a common [0, 1] range.
        bm25_norm = self._normalize(bm25_scores)
        dense_norm = self._normalize(dense_scores)
        hybrid_scores = (
            self.alpha * dense_norm + (1.0 - self.alpha) * bm25_norm
        )

        # Pick top-k indices. Clamp k so a negative value cannot turn the
        # slice below into "all but the last |k|".
        k = max(0, min(k, len(hybrid_scores)))
        # A stable sort on the negated scores gives descending order while
        # keeping tied documents in their original order.
        topk_idx = np.argsort(-hybrid_scores, kind="stable")[:k]
        return topk_idx.tolist()