from typing import List, Tuple
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi

from .config import EMBEDDING_MODEL

class ExampleRetriever:
    """Hybrid BM25 + dense retriever over per-example documents.

    For each RAGBench example we receive a list of documents, where each
    document is a list of (sentence_key, sentence_text) tuples.

    This retriever:
    - Builds a temporary BM25 index over all documents of the example.
    - Encodes each document into a dense embedding with SentenceTransformer.
    - Computes a *hybrid* relevance score that is a weighted combination
      of BM25 (lexical) and dense cosine similarity (semantic).
    - Returns indices of the top-k most relevant documents.

    This design matches the RAGBench diagram block:
      Retrieval -> BM25 + LLM-Embedder -> Hybrid Search.
    """  # noqa: E501

    def __init__(self, alpha: float = 0.5) -> None:
        """Create the retriever.

        Args:
            alpha: Weight for the dense similarity score. The BM25 weight
                is (1 - alpha). alpha=0.5 gives an even hybrid; setting
                alpha=1.0 gives dense-only, alpha=0.0 gives BM25-only.
        """
        self.embedder = SentenceTransformer(EMBEDDING_MODEL)
        self.alpha = alpha

    def _encode(self, texts: List[str]) -> np.ndarray:
        """Encode a list of texts into dense embeddings."""
        return self.embedder.encode(texts, show_progress_bar=False)

    def _prepare_docs(
        self,
        documents_sentences: List[List[Tuple[str, str]]],
    ) -> Tuple[List[str], List[List[str]]]:
        """Convert documents into plain text and tokenized form.

        Sentence keys are discarded; only the sentence texts are used.

        Returns:
            doc_texts: one string per document, its sentences joined by a
                single space (an empty document yields an empty string).
            tokenized_docs: lowercased, whitespace-split tokens of each
                document text, used for BM25.
        """
        doc_texts: List[str] = [
            " ".join(sent for _, sent in doc) for doc in documents_sentences
        ]
        tokenized_docs: List[List[str]] = [
            text.lower().split() for text in doc_texts
        ]
        return doc_texts, tokenized_docs

    @staticmethod
    def _normalize(scores: np.ndarray) -> np.ndarray:
        """Min-max normalize an array of scores to the range [0, 1].

        An empty array is returned unchanged. If all scores are equal
        (within 1e-12), returns a vector of 0.5 to avoid division by zero
        and still allow a meaningful hybrid combination.
        """
        if scores.size == 0:
            return scores
        s_min = float(scores.min())
        s_max = float(scores.max())
        if s_max - s_min < 1e-12:
            return np.full_like(scores, 0.5, dtype=float)
        return (scores - s_min) / (s_max - s_min)

    @staticmethod
    def _top_k(scores: np.ndarray, k: int) -> List[int]:
        """Return indices of the k highest scores, best first.

        Args:
            scores: 1-D array of relevance scores.
            k: Number of indices wanted. Values above the number of
                scores are clamped; zero or negative values yield [].

        Returns:
            Indices ordered by descending score. Equal scores keep their
            original (ascending index) order, and NaN scores rank last.
        """
        scores = np.asarray(scores, dtype=float)
        # Clamp on both sides: a negative k used as a slice bound would
        # silently mean "all but the last |k|" instead of "nothing".
        k = max(0, min(k, scores.size))
        if k == 0:
            return []
        # Sort the negated scores with a stable algorithm instead of
        # reversing an ascending sort: ties then stay in input order and
        # NaN (which NumPy sorts to the end) lands at the bottom of the
        # ranking rather than at the top.
        order = np.argsort(-scores, kind="stable")
        return order[:k].tolist()

    def rank_docs(
        self,
        question: str,
        documents_sentences: List[List[Tuple[str, str]]],
        k: int = 4,
    ) -> List[int]:
        """Return indices of the top-k relevant documents for a question.

        This uses a hybrid BM25 + dense approach over *all* documents
        passed for the current example.

        Args:
            question: The query text.
            documents_sentences: Documents of the example, each a list of
                (sentence_key, sentence_text) tuples.
            k: Maximum number of document indices to return.

        Returns:
            Indices into ``documents_sentences`` ordered from most to
            least relevant; at most ``k`` entries. Empty when there are
            no documents or ``k`` is not positive.
        """
        # Nothing to rank, or nothing requested: skip indexing/encoding.
        if not documents_sentences or k <= 0:
            return []

        # Prepare document representations
        doc_texts, tokenized_docs = self._prepare_docs(documents_sentences)

        # --- BM25 sparse scores ---
        if any(tokenized_docs):
            bm25 = BM25Okapi(tokenized_docs)
            bm25_scores = np.array(
                bm25.get_scores(question.lower().split()), dtype=float
            )
        else:
            # No document contains a single token, so there is no lexical
            # signal to index. NOTE(review): rank_bm25 appears to divide by
            # the vocabulary size when building the index, which would
            # fail here - confirm against the installed version.
            bm25_scores = np.zeros(len(tokenized_docs), dtype=float)

        # --- Dense cosine similarity scores ---
        q_emb = self._encode([question])
        d_emb = self._encode(doc_texts)
        dense_scores = cosine_similarity(q_emb, d_emb)[0]

        # --- Hybrid score: weighted combination ---
        bm25_norm = self._normalize(bm25_scores)
        dense_norm = self._normalize(dense_scores)
        hybrid_scores = (
            self.alpha * dense_norm + (1.0 - self.alpha) * bm25_norm
        )

        # Pick top-k indices
        return self._top_k(hybrid_scores, k)