Renangi committed
Commit 5e74947 · Parent: 5de7cee

Code changes as per mentor points and diagram 2

Files changed (1)
  1. ragbench_eval/retriever.py +63 -64
ragbench_eval/retriever.py CHANGED
@@ -1,5 +1,4 @@
 from typing import List, Tuple
-
 import numpy as np
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
@@ -7,43 +6,69 @@ from rank_bm25 import BM25Okapi
 
 from .config import EMBEDDING_MODEL
 
-
 class ExampleRetriever:
-    """
-    Hybrid retriever used in this project.
+    """Hybrid BM25 + dense retriever over per-example documents.
 
     For each RAGBench example we receive a list of documents, where each
-    document is itself a list of (sentence_key, sentence_text) pairs.
+    document is a list of (sentence_key, sentence_text) tuples.
 
     This retriever:
-    1. Uses **all** documents provided for that example.
-    2. Builds both a dense semantic index (SentenceTransformer) and a
-       sparse lexical index (BM25).
-    3. Computes a **hybrid score** = alpha * dense + (1 - alpha) * BM25.
-    4. Returns the indices of the top-k highest scoring documents.
-    """
+    - Builds a temporary BM25 index over all documents of the example.
+    - Encodes each document into a dense embedding with SentenceTransformer.
+    - Computes a *hybrid* relevance score that is a weighted combination
+      of BM25 (lexical) and dense cosine similarity (semantic).
+    - Returns indices of the top-k most relevant documents.
+
+    This design matches the RAGBench diagram block:
+        Retrieval -> BM25 + LLM-Embedder -> Hybrid Search.
+    """  # noqa: E501
 
     def __init__(self, alpha: float = 0.5) -> None:
-        # Dense encoder used for semantic similarity
+        """Create the retriever.
+
+        Args:
+            alpha: Weight for the dense similarity score. The BM25 weight
+                is (1 - alpha). alpha=0.5 gives an even hybrid; setting
+                alpha=1.0 gives dense-only, alpha=0.0 gives BM25-only.
+        """
         self.embedder = SentenceTransformer(EMBEDDING_MODEL)
-        # Weight for dense vs. BM25 scores in the hybrid fusion
         self.alpha = alpha
 
     def _encode(self, texts: List[str]) -> np.ndarray:
-        """Encode a list of texts into dense vectors."""
+        """Encode a list of texts into dense embeddings."""
        return self.embedder.encode(texts, show_progress_bar=False)
 
+    def _prepare_docs(
+        self,
+        documents_sentences: List[List[Tuple[str, str]]],
+    ) -> Tuple[List[str], List[List[str]]]:
+        """Convert documents into plain text and tokenized form.
+
+        Returns:
+            doc_texts: one concatenated string per document.
+            tokenized_docs: tokenized version used for BM25.
+        """
+        doc_texts: List[str] = [
+            " ".join(sent for _, sent in doc) for doc in documents_sentences
+        ]
+        tokenized_docs: List[List[str]] = [
+            text.lower().split() for text in doc_texts
+        ]
+        return doc_texts, tokenized_docs
+
     @staticmethod
     def _normalize(scores: np.ndarray) -> np.ndarray:
-        """Min-max normalize a 1D score array to [0, 1]."""
+        """Normalize an array of scores to the range [0, 1].
+
+        If all scores are equal, returns a vector of 0.5 to avoid division
+        by zero and still allow a meaningful hybrid combination.
+        """
         if scores.size == 0:
             return scores
         s_min = float(scores.min())
         s_max = float(scores.max())
-        if s_max - s_min < 1e-8:
-            # All scores are (almost) identical; return zeros so that
-            # this component does not dominate the hybrid score.
-            return np.zeros_like(scores, dtype=float)
+        if s_max - s_min < 1e-12:
+            return np.full_like(scores, 0.5, dtype=float)
         return (scores - s_min) / (s_max - s_min)
 
     def rank_docs(
@@ -52,60 +77,34 @@ class ExampleRetriever:
         documents_sentences: List[List[Tuple[str, str]]],
         k: int = 4,
     ) -> List[int]:
-        """
-        Rank the documents for a single RAGBench example.
-
-        Parameters
-        ----------
-        question:
-            The user question string.
-        documents_sentences:
-            List of documents. Each document is a list of
-            (sentence_key, sentence_text) pairs.
-        k:
-            Number of top documents to return.
-
-        Returns
-        -------
-        List[int]
-            Indices of the top-k documents in `documents_sentences`,
-            sorted from most to least relevant.
+        """Return indices of the top-k relevant documents for a question.
+
+        This uses a hybrid BM25 + dense approach over *all* documents
+        passed for the current example.
         """
         if not documents_sentences:
             return []
 
-        # ------------------------------------------------------------------
-        # 1) Flatten each document into a single text block
-        # ------------------------------------------------------------------
-        doc_texts: List[str] = [
-            " ".join(sent for _, sent in doc) for doc in documents_sentences
-        ]
+        # Prepare document representations
+        doc_texts, tokenized_docs = self._prepare_docs(documents_sentences)
 
-        # ------------------------------------------------------------------
-        # 2) Dense semantic similarity using SentenceTransformer
-        # ------------------------------------------------------------------
+        # --- BM25 sparse scores ---
+        bm25 = BM25Okapi(tokenized_docs)
+        bm25_scores = np.array(
+            bm25.get_scores(question.lower().split()), dtype=float
+        )
+
+        # --- Dense cosine similarity scores ---
         q_emb = self._encode([question])
         d_emb = self._encode(doc_texts)
-        dense_scores = cosine_similarity(q_emb, d_emb)[0]  # shape: (n_docs,)
-
-        # ------------------------------------------------------------------
-        # 3) Sparse lexical similarity using BM25
-        # ------------------------------------------------------------------
-        tokenized_docs = [doc.split() for doc in doc_texts]
-        bm25 = BM25Okapi(tokenized_docs)
-        bm25_scores = np.array(bm25.get_scores(question.split()), dtype=float)
+        dense_scores = cosine_similarity(q_emb, d_emb)[0]
 
-        # ------------------------------------------------------------------
-        # 4) Hybrid fusion of dense + BM25 scores
-        # ------------------------------------------------------------------
-        dense_norm = self._normalize(dense_scores)
+        # --- Hybrid score: weighted combination ---
         bm25_norm = self._normalize(bm25_scores)
+        dense_norm = self._normalize(dense_scores)
         hybrid_scores = self.alpha * dense_norm + (1.0 - self.alpha) * bm25_norm
 
-        # ------------------------------------------------------------------
-        # 5) Return indices of top-k documents
-        # ------------------------------------------------------------------
-        topk = min(k, len(doc_texts))
-        topk_idx = np.argsort(hybrid_scores)[::-1][:topk]
+        # Pick top-k indices
+        k = min(k, len(hybrid_scores))
+        topk_idx = np.argsort(hybrid_scores)[::-1][:k]
         return topk_idx.tolist()
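For anyone trying the new retriever outside the evaluation pipeline, here is a minimal usage sketch. It is not part of the commit: it assumes `ragbench_eval` is importable and that `EMBEDDING_MODEL` in `ragbench_eval/config.py` names a valid SentenceTransformer checkpoint (constructing the retriever loads that model); the toy documents are invented for illustration.

# Usage sketch (illustrative, not part of this commit).
from ragbench_eval.retriever import ExampleRetriever

# Each document is a list of (sentence_key, sentence_text) tuples,
# matching the RAGBench format that rank_docs expects.
docs = [
    [("d0_s0", "Paris is the capital of France."),
     ("d0_s1", "It lies on the Seine river.")],
    [("d1_s0", "BM25 is a bag-of-words ranking function.")],
    [("d2_s0", "The Eiffel Tower stands in Paris.")],
]

retriever = ExampleRetriever(alpha=0.5)  # even dense/BM25 weighting
top_idx = retriever.rank_docs("What is the capital of France?", docs, k=2)
print(top_idx)  # indices into `docs`, most relevant first, e.g. [0, 2]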
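Because the hybrid score is just a weighted sum of min-max-normalized components, the fusion step can also be sanity-checked with plain NumPy and hard-coded stand-in scores, with no model download:

# Fusion math only: `normalize` mirrors the committed _normalize, and the
# score arrays are made-up stand-ins for BM25 and cosine-similarity output.
import numpy as np

def normalize(scores: np.ndarray) -> np.ndarray:
    s_min, s_max = float(scores.min()), float(scores.max())
    if s_max - s_min < 1e-12:
        return np.full_like(scores, 0.5, dtype=float)  # all-equal fallback
    return (scores - s_min) / (s_max - s_min)

bm25_scores = np.array([2.0, 0.0, 1.0])   # stand-in lexical scores
dense_scores = np.array([0.9, 0.1, 0.5])  # stand-in cosine similarities

alpha = 0.5  # dense weight; BM25 gets (1 - alpha)
hybrid = alpha * normalize(dense_scores) + (1.0 - alpha) * normalize(bm25_scores)
print(np.argsort(hybrid)[::-1][:2].tolist())  # -> [0, 2]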