Spaces:
Running
Running
| from typing import List, Tuple | |
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from rank_bm25 import BM25Okapi | |
| from .config import EMBEDDING_MODEL | |
class ExampleRetriever:
    """Hybrid BM25 + dense retriever over per-example documents.

    For each RAGBench example we receive a list of documents, where each
    document is a list of (sentence_key, sentence_text) tuples.

    This retriever:
    - Builds a temporary BM25 index over all documents of the example.
    - Encodes each document into a dense embedding with SentenceTransformer.
    - Computes a *hybrid* relevance score that is a weighted combination
      of BM25 (lexical) and dense cosine similarity (semantic).
    - Returns indices of the top-k most relevant documents.

    This design matches the RAGBench diagram block:
    Retrieval -> BM25 + LLM-Embedder -> Hybrid Search.
    """  # noqa: E501

    def __init__(self, alpha: float = 0.5) -> None:
        """Create the retriever.

        Args:
            alpha: Weight for the dense similarity score. The BM25 weight
                is (1 - alpha). alpha=0.5 gives an even hybrid; setting
                alpha=1.0 gives dense-only, alpha=0.0 gives BM25-only.
        """
        self.embedder = SentenceTransformer(EMBEDDING_MODEL)
        self.alpha = alpha

    def _encode(self, texts: List[str]) -> np.ndarray:
        """Encode a list of texts into dense embeddings.

        The progress bar is disabled because this is called once per
        example and would otherwise flood the output.
        """
        return self.embedder.encode(texts, show_progress_bar=False)

    def _prepare_docs(
        self,
        documents_sentences: List[List[Tuple[str, str]]],
    ) -> Tuple[List[str], List[List[str]]]:
        """Convert documents into plain text and tokenized form.

        Args:
            documents_sentences: one list of (sentence_key, sentence_text)
                tuples per document; the keys are ignored here.

        Returns:
            doc_texts: one string per document, its sentences joined by a
                single space.
            tokenized_docs: lowercased, whitespace-split tokens of each
                document text, used for BM25.
        """
        doc_texts: List[str] = [
            " ".join(sent for _, sent in doc) for doc in documents_sentences
        ]
        tokenized_docs: List[List[str]] = [
            text.lower().split() for text in doc_texts
        ]
        return doc_texts, tokenized_docs

    @staticmethod
    def _normalize(scores: np.ndarray) -> np.ndarray:
        """Min-max normalize an array of scores to the range [0, 1].

        Declared as a static method because it takes no ``self`` yet is
        invoked as ``self._normalize(...)`` from ``rank_docs``.

        Args:
            scores: array (or array-like) of raw scores.

        Returns:
            A float array of the same shape. An empty input is returned
            empty. If all scores are equal, returns a vector of 0.5 to
            avoid division by zero and still allow a meaningful hybrid
            combination.
        """
        scores = np.asarray(scores, dtype=float)
        if scores.size == 0:
            return scores
        s_min = float(scores.min())
        s_max = float(scores.max())
        # A (near-)zero spread cannot be rescaled; use a neutral midpoint.
        if s_max - s_min < 1e-12:
            return np.full_like(scores, 0.5, dtype=float)
        return (scores - s_min) / (s_max - s_min)

    def rank_docs(
        self,
        question: str,
        documents_sentences: List[List[Tuple[str, str]]],
        k: int = 4,
    ) -> List[int]:
        """Return indices of the top-k relevant documents for a question.

        This uses a hybrid BM25 + dense approach over *all* documents
        passed for the current example.

        Args:
            question: the query text.
            documents_sentences: one list of (sentence_key, sentence_text)
                tuples per document.
            k: maximum number of document indices to return; it is capped
                at the number of documents.

        Returns:
            Document indices ordered from highest to lowest hybrid score.
            Empty when there are no documents or ``k`` is not positive.
        """
        # Nothing to rank, or nothing requested. A negative k would
        # otherwise turn the final slice into "drop the last |k| items".
        if not documents_sentences or k <= 0:
            return []

        # Prepare document representations
        doc_texts, tokenized_docs = self._prepare_docs(documents_sentences)

        # --- BM25 sparse scores ---
        # The query is tokenized the same way as the documents.
        bm25 = BM25Okapi(tokenized_docs)
        bm25_scores = np.array(
            bm25.get_scores(question.lower().split()), dtype=float
        )

        # --- Dense cosine similarity scores ---
        q_emb = self._encode([question])
        d_emb = self._encode(doc_texts)
        # Single query row, so take row 0 to get one score per document.
        dense_scores = cosine_similarity(q_emb, d_emb)[0]

        # --- Hybrid score: weighted combination ---
        # Both score sets are rescaled to [0, 1] first so that alpha
        # weights them on a comparable scale.
        bm25_norm = self._normalize(bm25_scores)
        dense_norm = self._normalize(dense_scores)
        hybrid_scores = (
            self.alpha * dense_norm + (1.0 - self.alpha) * bm25_norm
        )

        # Pick top-k indices
        k = min(k, len(hybrid_scores))
        topk_idx = np.argsort(hybrid_scores)[::-1][:k]
        return topk_idx.tolist()