Code changes as per mentor points and diagram 2
ragbench_eval/retriever.py  +63 -64
CHANGED

@@ -1,5 +1,4 @@
 from typing import List, Tuple
-
 import numpy as np
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity

@@ -7,43 +6,69 @@ from rank_bm25 import BM25Okapi

 from .config import EMBEDDING_MODEL

-
 class ExampleRetriever:
-    """
-    Hybrid retriever used in this project.

     For each RAGBench example we receive a list of documents, where each
-    document is

     This retriever:

     def __init__(self, alpha: float = 0.5) -> None:
         self.embedder = SentenceTransformer(EMBEDDING_MODEL)
-        # Weight for dense vs. BM25 scores in the hybrid fusion
         self.alpha = alpha

     def _encode(self, texts: List[str]) -> np.ndarray:
-        """Encode a list of texts into dense
         return self.embedder.encode(texts, show_progress_bar=False)

     @staticmethod
     def _normalize(scores: np.ndarray) -> np.ndarray:
-        """
         if scores.size == 0:
             return scores
         s_min = float(scores.min())
         s_max = float(scores.max())
-        if s_max - s_min < 1e-
-            # this component does not dominate the hybrid score.
-            return np.zeros_like(scores, dtype=float)
         return (scores - s_min) / (s_max - s_min)

     def rank_docs(

@@ -52,60 +77,34 @@ class ExampleRetriever:
         documents_sentences: List[List[Tuple[str, str]]],
         k: int = 4,
     ) -> List[int]:
-        """
-        Parameters
-        ----------
-        question:
-            The user question string.
-        documents_sentences:
-            List of documents. Each document is a list of
-            (sentence_key, sentence_text) pairs.
-        k:
-            Number of top documents to return.
-
-        Returns
-        -------
-        List[int]
-            Indices of the top-k documents in `documents_sentences`,
-            sorted from most to least relevant.
         """
         if not documents_sentences:
             return []

-        # ------------------------------------------------------------------
-        doc_texts: List[str] = [
-            " ".join(sent for _, sent in doc) for doc in documents_sentences
-        ]

         q_emb = self._encode([question])
         d_emb = self._encode(doc_texts)
-        dense_scores = cosine_similarity(q_emb, d_emb)[0]

-        # ------------------------------------------------------------------
-        # 3) Sparse lexical similarity using BM25
-        # ------------------------------------------------------------------
-        tokenized_docs = [doc.split() for doc in doc_texts]
-        bm25 = BM25Okapi(tokenized_docs)
-        bm25_scores = np.array(bm25.get_scores(question.split()), dtype=float)

-        # 4) Hybrid fusion of dense + BM25 scores
-        # ------------------------------------------------------------------
-        dense_norm = self._normalize(dense_scores)
         bm25_norm = self._normalize(bm25_scores)
         hybrid_scores = self.alpha * dense_norm + (1.0 - self.alpha) * bm25_norm

-        topk = min(k, len(doc_texts))
-        topk_idx = np.argsort(hybrid_scores)[::-1][:topk]
         return topk_idx.tolist()
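
The main behavioral change in _normalize shows up in the hunks above: when every document receives the same score, the old code returned a vector of zeros, while the new code returns 0.5 so the tied component no longer drops out of the weighted sum. A minimal standalone sketch of the two behaviors in plain NumPy; the helper names here are illustrative only, and the 1e-12 cut-off is taken from the new version (the old cut-off value is truncated in the diff):

import numpy as np

def normalize_old(scores: np.ndarray) -> np.ndarray:
    # Old behavior: a constant score vector collapses to all zeros.
    if scores.size == 0:
        return scores
    s_min, s_max = float(scores.min()), float(scores.max())
    if s_max - s_min < 1e-12:
        return np.zeros_like(scores, dtype=float)
    return (scores - s_min) / (s_max - s_min)

def normalize_new(scores: np.ndarray) -> np.ndarray:
    # New behavior: a constant score vector becomes all 0.5.
    if scores.size == 0:
        return scores
    s_min, s_max = float(scores.min()), float(scores.max())
    if s_max - s_min < 1e-12:
        return np.full_like(scores, 0.5, dtype=float)
    return (scores - s_min) / (s_max - s_min)

bm25 = np.array([2.0, 2.0, 2.0])   # all documents tie on BM25
dense = np.array([0.1, 0.4, 0.9])  # dense similarity still separates them
alpha = 0.5

print(alpha * normalize_new(dense) + (1 - alpha) * normalize_new(bm25))
# [0.25   0.4375 0.75  ] -- the tied BM25 term contributes a constant 0.25
print(alpha * normalize_old(dense) + (1 - alpha) * normalize_old(bm25))
# [0.     0.1875 0.5   ] -- the tied BM25 term vanishes entirely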

ragbench_eval/retriever.py (new version):

from typing import List, Tuple
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi

from .config import EMBEDDING_MODEL

class ExampleRetriever:
    """Hybrid BM25 + dense retriever over per-example documents.

    For each RAGBench example we receive a list of documents, where each
    document is a list of (sentence_key, sentence_text) tuples.

    This retriever:
    - Builds a temporary BM25 index over all documents of the example.
    - Encodes each document into a dense embedding with SentenceTransformer.
    - Computes a *hybrid* relevance score that is a weighted combination
      of BM25 (lexical) and dense cosine similarity (semantic).
    - Returns indices of the top-k most relevant documents.

    This design matches the RAGBench diagram block:
    Retrieval -> BM25 + LLM-Embedder -> Hybrid Search.
    """  # noqa: E501

    def __init__(self, alpha: float = 0.5) -> None:
        """Create the retriever.

        Args:
            alpha: Weight for the dense similarity score. The BM25 weight
                is (1 - alpha). alpha=0.5 gives an even hybrid; setting
                alpha=1.0 gives dense-only, alpha=0.0 gives BM25-only.
        """
        self.embedder = SentenceTransformer(EMBEDDING_MODEL)
        self.alpha = alpha

    def _encode(self, texts: List[str]) -> np.ndarray:
        """Encode a list of texts into dense embeddings."""
        return self.embedder.encode(texts, show_progress_bar=False)

    def _prepare_docs(
        self,
        documents_sentences: List[List[Tuple[str, str]]],
    ) -> Tuple[List[str], List[List[str]]]:
        """Convert documents into plain text and tokenized form.

        Returns:
            doc_texts: one concatenated string per document.
            tokenized_docs: tokenized version used for BM25.
        """
        doc_texts: List[str] = [
            " ".join(sent for _, sent in doc) for doc in documents_sentences
        ]
        tokenized_docs: List[List[str]] = [
            text.lower().split() for text in doc_texts
        ]
        return doc_texts, tokenized_docs

    @staticmethod
    def _normalize(scores: np.ndarray) -> np.ndarray:
        """Normalize an array of scores to the range [0, 1].

        If all scores are equal, returns a vector of 0.5 to avoid division
        by zero and still allow a meaningful hybrid combination.
        """
        if scores.size == 0:
            return scores
        s_min = float(scores.min())
        s_max = float(scores.max())
        if s_max - s_min < 1e-12:
            return np.full_like(scores, 0.5, dtype=float)
        return (scores - s_min) / (s_max - s_min)

    def rank_docs(
        self,
        question: str,
        documents_sentences: List[List[Tuple[str, str]]],
        k: int = 4,
    ) -> List[int]:
        """Return indices of the top-k relevant documents for a question.

        This uses a hybrid BM25 + dense approach over *all* documents
        passed for the current example.
        """
        if not documents_sentences:
            return []

        # Prepare document representations
        doc_texts, tokenized_docs = self._prepare_docs(documents_sentences)

        # --- BM25 sparse scores ---
        bm25 = BM25Okapi(tokenized_docs)
        bm25_scores = np.array(
            bm25.get_scores(question.lower().split()), dtype=float
        )

        # --- Dense cosine similarity scores ---
        q_emb = self._encode([question])
        d_emb = self._encode(doc_texts)
        dense_scores = cosine_similarity(q_emb, d_emb)[0]

        # --- Hybrid score: weighted combination ---
        bm25_norm = self._normalize(bm25_scores)
        dense_norm = self._normalize(dense_scores)
        hybrid_scores = self.alpha * dense_norm + (1.0 - self.alpha) * bm25_norm

        # Pick top-k indices
        k = min(k, len(hybrid_scores))
        topk_idx = np.argsort(hybrid_scores)[::-1][:k]
        return topk_idx.tolist()
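
For completeness, a minimal usage sketch of the updated retriever. It assumes the package is importable as ragbench_eval, that EMBEDDING_MODEL in its config names an installed sentence-transformers model, and that rank_bm25, sentence-transformers and scikit-learn are available; the question and documents below are made-up toy data, not RAGBench content:

from ragbench_eval.retriever import ExampleRetriever

# One RAGBench-style example: each document is a list of
# (sentence_key, sentence_text) tuples, as rank_docs expects.
documents_sentences = [
    [("0a", "The Eiffel Tower is located in Paris."),
     ("0b", "It was completed in 1889.")],
    [("1a", "The Colosseum is an ancient amphitheatre in Rome.")],
    [("2a", "Paris is the capital of France."),
     ("2b", "The Louvre museum is also in Paris.")],
]

# alpha=0.5 weights the dense and BM25 signals equally (see __init__).
retriever = ExampleRetriever(alpha=0.5)

top_docs = retriever.rank_docs(
    question="Where is the Eiffel Tower?",
    documents_sentences=documents_sentences,
    k=2,
)
print(top_docs)  # e.g. [0, 2] -- most relevant first; exact order depends on the embedding model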