Code changes as per mentor points and diagram 2
ragbench_eval/retriever.py  +63 -64
CHANGED

@@ -1,5 +1,4 @@
 from typing import List, Tuple
-
 import numpy as np
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity

@@ -7,43 +6,69 @@ from rank_bm25 import BM25Okapi

 from .config import EMBEDDING_MODEL

-
 class ExampleRetriever:
-    """
-    Hybrid retriever used in this project.

     For each RAGBench example we receive a list of documents, where each
-    document is

     This retriever:

     def __init__(self, alpha: float = 0.5) -> None:
         self.embedder = SentenceTransformer(EMBEDDING_MODEL)
-        # Weight for dense vs. BM25 scores in the hybrid fusion
         self.alpha = alpha

     def _encode(self, texts: List[str]) -> np.ndarray:
-        """Encode a list of texts into dense
         return self.embedder.encode(texts, show_progress_bar=False)

     @staticmethod
     def _normalize(scores: np.ndarray) -> np.ndarray:
-        """
         if scores.size == 0:
             return scores
         s_min = float(scores.min())
         s_max = float(scores.max())
-        if s_max - s_min < 1e-
-            # this component does not dominate the hybrid score.
-            return np.zeros_like(scores, dtype=float)
         return (scores - s_min) / (s_max - s_min)

     def rank_docs(

@@ -52,60 +77,34 @@ class ExampleRetriever:
         documents_sentences: List[List[Tuple[str, str]]],
         k: int = 4,
     ) -> List[int]:
-        """
-        Parameters
-        ----------
-        question:
-            The user question string.
-        documents_sentences:
-            List of documents. Each document is a list of
-            (sentence_key, sentence_text) pairs.
-        k:
-            Number of top documents to return.
-
-        Returns
-        -------
-        List[int]
-            Indices of the top-k documents in `documents_sentences`,
-            sorted from most to least relevant.
         """
         if not documents_sentences:
             return []

-        # ------------------------------------------------------------------
-        doc_texts: List[str] = [
-            " ".join(sent for _, sent in doc) for doc in documents_sentences
-        ]

         q_emb = self._encode([question])
         d_emb = self._encode(doc_texts)
-        dense_scores = cosine_similarity(q_emb, d_emb)[0]

-        # ------------------------------------------------------------------
-        # 3) Sparse lexical similarity using BM25
-        # ------------------------------------------------------------------
-        tokenized_docs = [doc.split() for doc in doc_texts]
-        bm25 = BM25Okapi(tokenized_docs)
-        bm25_scores = np.array(bm25.get_scores(question.split()), dtype=float)

-        # 4) Hybrid fusion of dense + BM25 scores
-        # ------------------------------------------------------------------
-        dense_norm = self._normalize(dense_scores)
         bm25_norm = self._normalize(bm25_scores)
         hybrid_scores = self.alpha * dense_norm + (1.0 - self.alpha) * bm25_norm

-        topk = min(k, len(doc_texts))
-        topk_idx = np.argsort(hybrid_scores)[::-1][:topk]
         return topk_idx.tolist()
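
The main behavioral change in _normalize shows up in the hunks above: when every document receives the same score, the old code returned a vector of zeros, while the new code returns 0.5 so the tied component no longer drops out of the weighted sum. A minimal standalone sketch of the two behaviors in plain NumPy; the helper names here are illustrative only, and the 1e-12 cut-off is taken from the new version (the old cut-off value is truncated in the diff):

import numpy as np

def normalize_old(scores: np.ndarray) -> np.ndarray:
    # Old behavior: a constant score vector collapses to all zeros.
    if scores.size == 0:
        return scores
    s_min, s_max = float(scores.min()), float(scores.max())
    if s_max - s_min < 1e-12:
        return np.zeros_like(scores, dtype=float)
    return (scores - s_min) / (s_max - s_min)

def normalize_new(scores: np.ndarray) -> np.ndarray:
    # New behavior: a constant score vector becomes all 0.5.
    if scores.size == 0:
        return scores
    s_min, s_max = float(scores.min()), float(scores.max())
    if s_max - s_min < 1e-12:
        return np.full_like(scores, 0.5, dtype=float)
    return (scores - s_min) / (s_max - s_min)

bm25 = np.array([2.0, 2.0, 2.0])   # all documents tie on BM25
dense = np.array([0.1, 0.4, 0.9])  # dense similarity still separates them
alpha = 0.5

print(alpha * normalize_new(dense) + (1 - alpha) * normalize_new(bm25))
# [0.25   0.4375 0.75  ] -- the tied BM25 term contributes a constant 0.25
print(alpha * normalize_old(dense) + (1 - alpha) * normalize_old(bm25))
# [0.     0.1875 0.5   ] -- the tied BM25 term vanishes entirely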

ragbench_eval/retriever.py (new version):

from typing import List, Tuple
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi

from .config import EMBEDDING_MODEL

class ExampleRetriever:
    """Hybrid BM25 + dense retriever over per-example documents.

    For each RAGBench example we receive a list of documents, where each
    document is a list of (sentence_key, sentence_text) tuples.

    This retriever:
    - Builds a temporary BM25 index over all documents of the example.
    - Encodes each document into a dense embedding with SentenceTransformer.
    - Computes a *hybrid* relevance score that is a weighted combination
      of BM25 (lexical) and dense cosine similarity (semantic).
    - Returns indices of the top-k most relevant documents.

    This design matches the RAGBench diagram block:
    Retrieval -> BM25 + LLM-Embedder -> Hybrid Search.
    """  # noqa: E501

    def __init__(self, alpha: float = 0.5) -> None:
        """Create the retriever.

        Args:
            alpha: Weight for the dense similarity score. The BM25 weight
                is (1 - alpha). alpha=0.5 gives an even hybrid; setting
                alpha=1.0 gives dense-only, alpha=0.0 gives BM25-only.
        """
        self.embedder = SentenceTransformer(EMBEDDING_MODEL)
        self.alpha = alpha

    def _encode(self, texts: List[str]) -> np.ndarray:
        """Encode a list of texts into dense embeddings."""
        return self.embedder.encode(texts, show_progress_bar=False)

    def _prepare_docs(
        self,
        documents_sentences: List[List[Tuple[str, str]]],
    ) -> Tuple[List[str], List[List[str]]]:
        """Convert documents into plain text and tokenized form.

        Returns:
            doc_texts: one concatenated string per document.
            tokenized_docs: tokenized version used for BM25.
        """
        doc_texts: List[str] = [
            " ".join(sent for _, sent in doc) for doc in documents_sentences
        ]
        tokenized_docs: List[List[str]] = [
            text.lower().split() for text in doc_texts
        ]
        return doc_texts, tokenized_docs

    @staticmethod
    def _normalize(scores: np.ndarray) -> np.ndarray:
        """Normalize an array of scores to the range [0, 1].

        If all scores are equal, returns a vector of 0.5 to avoid division
        by zero and still allow a meaningful hybrid combination.
        """
        if scores.size == 0:
            return scores
        s_min = float(scores.min())
        s_max = float(scores.max())
        if s_max - s_min < 1e-12:
            return np.full_like(scores, 0.5, dtype=float)
        return (scores - s_min) / (s_max - s_min)

    def rank_docs(
        self,
        question: str,
        documents_sentences: List[List[Tuple[str, str]]],
        k: int = 4,
    ) -> List[int]:
        """Return indices of the top-k relevant documents for a question.

        This uses a hybrid BM25 + dense approach over *all* documents
        passed for the current example.
        """
        if not documents_sentences:
            return []

        # Prepare document representations
        doc_texts, tokenized_docs = self._prepare_docs(documents_sentences)

        # --- BM25 sparse scores ---
        bm25 = BM25Okapi(tokenized_docs)
        bm25_scores = np.array(
            bm25.get_scores(question.lower().split()), dtype=float
        )

        # --- Dense cosine similarity scores ---
        q_emb = self._encode([question])
        d_emb = self._encode(doc_texts)
        dense_scores = cosine_similarity(q_emb, d_emb)[0]

        # --- Hybrid score: weighted combination ---
        bm25_norm = self._normalize(bm25_scores)
        dense_norm = self._normalize(dense_scores)
        hybrid_scores = self.alpha * dense_norm + (1.0 - self.alpha) * bm25_norm

        # Pick top-k indices
        k = min(k, len(hybrid_scores))
        topk_idx = np.argsort(hybrid_scores)[::-1][:k]
        return topk_idx.tolist()
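
For completeness, a minimal usage sketch of the updated retriever. It assumes the package is importable as ragbench_eval, that EMBEDDING_MODEL in its config names an installed sentence-transformers model, and that rank_bm25, sentence-transformers and scikit-learn are available; the question and documents below are made-up toy data, not RAGBench content:

from ragbench_eval.retriever import ExampleRetriever

# One RAGBench-style example: each document is a list of
# (sentence_key, sentence_text) tuples, as rank_docs expects.
documents_sentences = [
    [("0a", "The Eiffel Tower is located in Paris."),
     ("0b", "It was completed in 1889.")],
    [("1a", "The Colosseum is an ancient amphitheatre in Rome.")],
    [("2a", "Paris is the capital of France."),
     ("2b", "The Louvre museum is also in Paris.")],
]

# alpha=0.5 weights the dense and BM25 signals equally (see __init__).
retriever = ExampleRetriever(alpha=0.5)

top_docs = retriever.rank_docs(
    question="Where is the Eiffel Tower?",
    documents_sentences=documents_sentences,
    k=2,
)
print(top_docs)  # e.g. [0, 2] -- most relevant first; exact order depends on the embedding model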