Spaces:

Rodricklw
/

api-sesgos

Sleeping

App Files Files Community

RodrickMJ committed on Dec 5, 2025

Commit

f1db1e3

1 Parent(s): e2848af

Add application file

Browse files

Files changed (13) hide show

Dockerfile +14 -0
requirements.txt +9 -0
src/__init__.py +0 -0
src/engine/__init__.py +0 -0
src/engine/analysis_engine.py +180 -0
src/index.py +68 -0
src/mineria/__init__.py +0 -0
src/mineria/mining.py +35 -0
src/semantic/__init__.py +0 -0
src/semantic/nli_distortion.py +475 -0
src/semantic/relevance.py +53 -0
src/sesgos/__init__.py +0 -0
src/sesgos/sesgos.py +140 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,14 @@

+# Base image: official Python 3.10.
+FROM python:3.10
+# Create a user with UID 1000 and run every later step as that user.
+RUN useradd -m -u 1000 user
+USER user
+# Put the user's local bin directory on PATH (presumably where pip places
+# console scripts such as uvicorn for a non-root install — confirm).
+ENV PATH="/home/user/.local/bin:$PATH"
+WORKDIR /app
+# Copy and install the requirements before the rest of the sources.
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+COPY --chown=user . /app
+# Serve the FastAPI app object `app` from src/index.py on 0.0.0.0:7860.
+CMD ["uvicorn", "src.index:app", "--host", "0.0.0.0", "--port", "7860"]

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+fastapi
+uvicorn[standard]
+torch>=2.1.0
+transformers>=4.36
+sentence-transformers>=2.2.2
+keybert>=0.8.0
+bitsandbytes>=0.41.0
+accelerate
+scikit-learn

src/__init__.py ADDED Viewed

File without changes

src/engine/__init__.py ADDED Viewed

File without changes

src/engine/analysis_engine.py ADDED Viewed

	@@ -0,0 +1,180 @@

+from src.semantic.relevance import SemanticRelevance
+from src.semantic.nli_distortion import DistortionDetectorNLI
+from src.sesgos.sesgos import BiasDetector
+from src.mineria.mining import MiningFeatures
+from datetime import datetime
+class AnalysisEngine:
+    """Run the full analysis pipeline for a user comment against a document.
+
+    Four components are built once in ``__init__`` and reused by every call
+    to :meth:`analyze`: semantic relevance, NLI-based distortion detection,
+    bias detection and text mining.
+    """
+    def __init__(self):
+        self.relevance = SemanticRelevance()
+        self.distortion = DistortionDetectorNLI()
+        self.bias = BiasDetector()
+        self.mining = MiningFeatures()
+    def map_veredicto(self, decision: str):
+        """Map the distortion detector's decision text to a short verdict code.
+
+        Returns one of ``"indefinido"``, ``"distorsion"``, ``"parcial"``,
+        ``"neutral"`` or ``"correcto"``.
+        """
+        if not decision:
+            return "indefinido"
+        decision = decision.lower()
+        # No verifiable claim was analysed, so there is nothing to call correct.
+        if "sin suficiente" in decision:
+            return "indefinido"
+        if "gravemente" in decision:
+            return "distorsion"
+        # "fiel o parcial" is the no-contradiction decision; it contains the
+        # substring "parcial", so it must be recognised before that check.
+        if "fiel" in decision:
+            return "correcto"
+        if "parcial" in decision:
+            return "parcial"
+        if "neutral" in decision:
+            return "neutral"
+        return "correcto"
+    @staticmethod
+    def _format_sesgos(bias_result: dict) -> list:
+        """Normalise a bias-detector result into ``{"label", "score"}`` items."""
+        return [
+            {
+                "label": s.get("sesgo", s.get("label", "desconocido")),
+                "score": s.get("confianza", s.get("score", 0.0)),
+            }
+            for s in bias_result.get("sesgos_detectados", [])
+        ]
+    @staticmethod
+    def _format_contradicciones(distortion: dict) -> list:
+        """Translate the detector's per-sentence details into the API schema.
+
+        Every key is read with a default so a missing field cannot raise.
+        """
+        formatted = []
+        for d in distortion.get("detalles", []):
+            scores = d.get("scores_detail", {}) or {}
+            formatted.append(
+                {
+                    "parrafo": d.get("paragraph", ""),
+                    "oracion_usuario": d.get("sentence", ""),
+                    "claim_extraido": d.get("cleaned_claim"),
+                    "claim_transformado": d.get(
+                        "transformed_claim", d.get("cleaned_claim")
+                    ),
+                    "negacion_detectada": d.get("is_negation", False),
+                    "tipo_distorsion": d.get("best_label", "neutral"),
+                    "puntaje_principal": round(d.get("best_score", 0.0), 3),
+                    "puntajes_detallados": {
+                        "contradiccion": round(scores.get("contradiction", 0.0), 3),
+                        "neutral": round(scores.get("neutral", 0.0), 3),
+                        "coincidencia": round(scores.get("entailment", 0.0), 3),
+                    },
+                }
+            )
+        return formatted
+    def analyze(self, user_text: str, document_paragraphs: list, title: str):
+        """Analyse ``user_text`` against the document paragraphs.
+
+        Args:
+            user_text: The user's comment.
+            document_paragraphs: List of dicts; ``"text"`` and ``"type"`` are
+                the keys read here.
+            title: Document title. The special value ``"RAG_CONTEXT"``
+                switches the relevance gate to the RAG rule.
+
+        Returns:
+            A dict. When the relevance gate rejects the input it carries
+            ``"status"`` and ``"relevance"`` only (plus ``"message"`` for
+            RAG); otherwise it carries ``"scraped_content"`` and
+            ``"analisis"``.
+        """
+        # 1) Overall relevance of the comment to the document.
+        rel = self.relevance.relate(user_text, title, document_paragraphs)
+        is_rag = title == "RAG_CONTEXT"
+        # RAG gate: judged on the best single paragraph.
+        if is_rag:
+            best_score = rel["best_paragraph"]["score"]
+            if best_score < 0.25:
+                return {
+                    "status": "rag_irrelevante",
+                    "relevance": rel,
+                    "message": "El contexto recuperado por el RAG no es relevante al comentario del usuario.",
+                }
+        # Scraping gate: judged on the whole-document decision.
+        if not is_rag and rel["decision_document"] in [
+            "no relacionado",
+            "tangencial",
+        ]:
+            return {
+                "status": "poco_relevante",
+                "relevance": rel,
+            }
+        # 2) Distortion (NLI + heuristics).
+        distortion = self.distortion.analyze_user_comment(
+            user_text, document_paragraphs
+        )
+        # 3) Biases in the document (title + all paragraph texts).
+        full_document_text = (
+            title + ". " + " ".join(p.get("text", "") for p in document_paragraphs)
+        )
+        biases_document = self.bias.detect(full_document_text)
+        # 4) Biases in the user's comment.
+        biases_user = self.bias.detect(user_text)
+        # 5) Text mining on the user's comment.
+        mining = self.mining.extract(user_text)
+        return {
+            "scraped_content": {
+                "title": title,
+                "url": "",
+                "fecha_recoleccion": datetime.utcnow().isoformat(),
+                "segmentos_contenido": [
+                    {
+                        "type": p.get("type", "p"),
+                        "text": p.get("text", ""),
+                    }
+                    for p in document_paragraphs
+                ],
+            },
+            "analisis": {
+                "document_sesgo": {
+                    "sesgos_encontrados": self._format_sesgos(biases_document),
+                    "explicacion": "Sesgos detectados en el documento mediante análisis heurístico.",
+                },
+                "user_sesgo": {
+                    "sesgos_encontrados": self._format_sesgos(biases_user),
+                    "explicacion": "Sesgos detectados en el comentario del usuario.",
+                },
+                "document_distorsion": {
+                    "veredicto": self.map_veredicto(distortion.get("decision")),
+                    "contradicciones": self._format_contradicciones(distortion),
+                },
+                "mineria": mining,
+            },
+        }

src/index.py ADDED Viewed

	@@ -0,0 +1,68 @@

+from fastapi import FastAPI
+from pydantic import BaseModel
+from typing import List
+from contextlib import asynccontextmanager
+from src.engine.analysis_engine import AnalysisEngine
+# Module-level AnalysisEngine shared by the route handlers; it stays None
+# until the lifespan hook below assigns it.
+engine = None
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Create the shared AnalysisEngine, then hand control back to FastAPI.
+
+    Nothing follows the ``yield``, so no explicit cleanup is performed.
+    """
+    global engine
+    engine = AnalysisEngine()
+    yield
+app = FastAPI(lifespan=lifespan, title="FactChecker Ultra-Ligero")
+class Paragraph(BaseModel):
+    """One document segment: a type tag plus its text."""
+    type: str
+    text: str
+class AnalysisRequest(BaseModel):
+    """Request body for POST /analyze."""
+    title: str
+    paragraphs: List[Paragraph]
+    user_text: str
+class RagItem(BaseModel):
+    """One RAG result; only its summary text is carried."""
+    summary: str
+class RagAnalysisRequest(BaseModel):
+    """Request body for POST /analyze-rag."""
+    rag_results: List[RagItem]
+    user_text: str
+@app.get("/")
+def home():
+    """Liveness endpoint: return a fixed status payload."""
+    payload = {"status": "corriendo", "modelos_cargados": "una sola vez"}
+    return payload
+@app.post("/analyze")
+async def analyze(payload: AnalysisRequest):
+    """Analyse a user comment against a document given as title + paragraphs.
+
+    Returns whatever ``AnalysisEngine.analyze`` returns for the request.
+    """
+    # Pydantic v2 deprecates ``.dict()`` in favour of ``.model_dump()``;
+    # use the new name when it exists and fall back to the v1 spelling.
+    paragraphs = [
+        p.model_dump() if hasattr(p, "model_dump") else p.dict()
+        for p in payload.paragraphs
+    ]
+    # NOTE(review): engine.analyze is a synchronous call inside an async
+    # handler, so it appears to hold the event loop while it runs — confirm
+    # whether a plain ``def`` handler would be preferable here.
+    return engine.analyze(
+        user_text=payload.user_text,
+        document_paragraphs=paragraphs,
+        title=payload.title,
+    )
+@app.post("/analyze-rag")
+async def analyze_rag(payload: RagAnalysisRequest):
+    """Analyse a user comment against RAG summaries using the shared engine.
+
+    Each summary becomes a paragraph of type ``"rag_summary"``; the title is
+    fixed to ``"RAG_CONTEXT"`` and the result is tagged ``source="rag"``.
+    """
+    # Wrap every summary in the paragraph shape the engine expects.
+    rag_paragraphs = []
+    for item in payload.rag_results:
+        rag_paragraphs.append({"type": "rag_summary", "text": item.summary})
+    # Same engine as /analyze, with the RAG marker as title.
+    outcome = engine.analyze(
+        user_text=payload.user_text,
+        document_paragraphs=rag_paragraphs,
+        title="RAG_CONTEXT",
+    )
+    # Tag the origin so the result can be traced later.
+    outcome["source"] = "rag"
+    return outcome

src/mineria/__init__.py ADDED Viewed

File without changes

src/mineria/mining.py ADDED Viewed

	@@ -0,0 +1,35 @@

+from transformers import pipeline
+from keybert import KeyBERT
+from sentence_transformers import SentenceTransformer
+class MiningFeatures:
+    """Extract sentiment and keywords from a text."""
+    def __init__(self):
+        """Load the sentiment pipeline and a KeyBERT model on a shared embedder."""
+        self.sentiment = pipeline(
+            "sentiment-analysis",
+            model="nlptown/bert-base-multilingual-uncased-sentiment",
+        )
+        self.kw_model = KeyBERT(
+            model=SentenceTransformer(
+                "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
+            )
+        )
+    def extract(self, text: str):
+        """Return ``{"sentiment": {"label", "score"}, "keywords": [...]}``.
+
+        The sentiment model sees only the first 512 characters of ``text``;
+        keyword extraction sees the whole text.
+        """
+        prediction = self.sentiment(text[:512])[0]
+        stars = prediction["label"]
+        # Collapse the star rating into three classes.
+        if "1 star" in stars or "2 stars" in stars:
+            polarity = "NEG"
+        elif "3 stars" in stars:
+            polarity = "NEU"
+        else:
+            polarity = "POS"
+        spanish_stop_words = ["de", "la", "que", "el", "en", "y", "a", "los", "un", "una"]
+        ranked = self.kw_model.extract_keywords(
+            text,
+            keyphrase_ngram_range=(1, 3),
+            stop_words=spanish_stop_words,
+            top_n=8,
+        )
+        # Keep only the phrase from each (phrase, score) pair.
+        phrases = [pair[0] for pair in ranked]
+        sentiment_info = {"label": polarity, "score": round(prediction["score"], 3)}
+        return {"sentiment": sentiment_info, "keywords": phrases}

src/semantic/__init__.py ADDED Viewed

File without changes

src/semantic/nli_distortion.py ADDED Viewed

	@@ -0,0 +1,475 @@

+# src/semantic/nli_distortion.py
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import torch
+from typing import List, Dict, Optional
+import re
+# Embeddings
+from sentence_transformers import SentenceTransformer, util
+class DistortionDetectorNLI:
+    """Detect whether a user comment distorts a document.
+
+    Each verifiable sentence of the comment is matched to its most similar
+    document paragraph (sentence embeddings), then classified with an NLI
+    model; regex heuristics can override the NLI result.
+    """
+    def __init__(
+        self,
+        model_name: str = "MoritzLaurer/deberta-v3-base-mnli-fever-anli",
+        emb_model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
+        similarity_threshold: float = 0.25,
+        contradiction_threshold: float = 0.55,
+    ):
+        """Load the NLI and embedding models and set thresholds and patterns.
+
+        Args:
+            model_name: Checkpoint of the NLI sequence classifier.
+            emb_model_name: Checkpoint of the sentence-embedding model.
+            similarity_threshold: Minimum cosine similarity for a paragraph
+                to be considered related to a claim.
+            contradiction_threshold: Minimum contradiction score for a
+                sentence to count as a contradiction.
+        """
+        # NLI model.
+        self.model_name = model_name
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        # NOTE(review): 4-bit loading presumably needs bitsandbytes and a
+        # CUDA device — confirm for the deployment hardware.
+        self.model = AutoModelForSequenceClassification.from_pretrained(
+            model_name, load_in_4bit=True, device_map="auto", torch_dtype=torch.float16
+        )
+        # Fallback class order, used by infer_nli only when the model config
+        # does not name its classes.
+        self.labels = ["contradiction", "neutral", "entailment"]
+        # Embedding model for semantic paragraph matching.
+        self.embedder = SentenceTransformer(emb_model_name)
+        # Thresholds.
+        self.similarity_threshold = similarity_threshold
+        self.contradiction_threshold = contradiction_threshold
+        # Regexes for negative claims a user may make.
+        self.negative_claims = [
+            r"goteras?",
+            r"sin\s+equipo",
+            r"no\s+funciona",
+            r"nunca\s+ha\s+funcionado",
+            r"descompuesto",
+            r"cartón",
+            r"madera\s+pintada",
+            r"simular\s+equipo",
+            r"maquetas?",
+            r"hospitales\s+privados",
+            r"se\s+inunda",
+            r"filtraciones",
+            r"vicios\s+ocultos",
+        ]
+        # Regexes marking a positive-toned document.
+        self.positive_indicators_doc = [
+            r"inauguró",
+            r"inversión\s+de\s+\d+",
+            r"cuenta\s+con\s+\d+",
+            r"atención\s+de\s+calidad",
+            r"servicios.*avanzados",
+            r"han\s+otorgado\s+\d+",
+        ]
+        # Regexes marking a scandal tone in the user's text.
+        self.scandal_keywords = [
+            r"escándalo",
+            r"vicios\s+ocultos",
+            r"bombo\s+y\s+platillo",
+            r"maquillaje",
+        ]
+    # ---------------------------
+    # UTIL: basic cleanup
+    # ---------------------------
+    def normalize(self, text: str) -> str:
+        """Lower-case, strip and collapse whitespace; empty input gives ``""``."""
+        if not text:
+            return ""
+        text = text.lower().strip()
+        text = re.sub(r"\s+", " ", text)
+        return text
+    # ---------------------------
+    # UTIL: light sarcasm detector
+    # ---------------------------
+    def detect_sarcasm(self, text: str) -> bool:
+        """Return True when ``text`` contains one of the fixed sarcasm markers.
+
+        Matching is a case-insensitive substring test.
+        """
+        if not text:
+            return False
+        marcas = [
+            "felicidades",
+            "qué orgullo",
+            "claro que sí",
+            "ajá",
+            "sí cómo no",
+            "bravo",
+            "qué bien",
+            "maravilloso",
+            "genial pero",
+        ]
+        t = text.lower()
+        return any(m in t for m in marcas)
+    # ---------------------------
+    # UTIL: NLI inference
+    # ---------------------------
+    def _label_order(self, n_classes: int) -> List[str]:
+        """Return the class name for each logit index.
+
+        The order is read from ``model.config.id2label`` because checkpoints
+        differ; ``self.labels`` is used when the config does not name exactly
+        contradiction / neutral / entailment.
+        """
+        id2label = getattr(getattr(self.model, "config", None), "id2label", None) or {}
+        names = [
+            str(id2label.get(i, id2label.get(str(i), ""))).lower()
+            for i in range(n_classes)
+        ]
+        if sorted(names) == ["contradiction", "entailment", "neutral"]:
+            return names
+        return list(self.labels)
+    def infer_nli(self, premise: str, hypothesis: str):
+        """Classify ``hypothesis`` against ``premise`` with the NLI model.
+
+        Returns:
+            ``{"label", "score", "scores"}`` where ``scores`` holds the
+            softmax probability of contradiction, neutral and entailment.
+        """
+        inputs = self.tokenizer(
+            premise, hypothesis, return_tensors="pt", truncation=True, max_length=512
+        ).to(self.model.device)
+        with torch.no_grad():
+            logits = self.model(**inputs).logits
+        probs = torch.softmax(logits, dim=1)[0].cpu()
+        # Map each probability to its class by name, not by a fixed position:
+        # a hard-coded order silently swaps entailment and contradiction on
+        # checkpoints that list their classes the other way round.
+        names = self._label_order(probs.shape[0])
+        by_name = {name: float(p) for name, p in zip(names, probs)}
+        idx = int(probs.argmax().item())
+        return {
+            "label": names[idx],
+            "score": float(probs[idx]),
+            "scores": {
+                "contradiction": by_name.get("contradiction", 0.0),
+                "neutral": by_name.get("neutral", 0.0),
+                "entailment": by_name.get("entailment", 0.0),
+            },
+        }
+    # ---------------------------
+    # Regex heuristics
+    # ---------------------------
+    def detect_contradiction_by_content(self, user_claim: str, document_text: str) -> dict:
+        """Apply three regex heuristics to spot a contradiction.
+
+        Returns:
+            ``{"is_contradiction", "confidence", "reason", "method"}``; the
+            first heuristic that fires wins.
+        """
+        user_lower = user_claim.lower()
+        doc_lower = document_text.lower()
+        neg_count = sum(1 for pattern in self.negative_claims if re.search(pattern, user_lower))
+        pos_count = sum(1 for pattern in self.positive_indicators_doc if re.search(pattern, doc_lower))
+        # HEURISTIC 1: several problems claimed, none mentioned by a positive document.
+        if neg_count >= 2 and pos_count >= 2:
+            problems_in_doc = sum(1 for pattern in self.negative_claims if re.search(pattern, doc_lower))
+            if problems_in_doc == 0:
+                confidence = min(0.95, 0.70 + (neg_count * 0.08))
+                return {
+                    "is_contradiction": True,
+                    "confidence": confidence,
+                    "reason": f"Usuario afirma {neg_count} problemas no mencionados en documento positivo",
+                    "method": "heuristic_negative_claims",
+                }
+        # HEURISTIC 2: the user repeats a number from the document in a negating context.
+        doc_numbers = re.findall(r"\d+", doc_lower)
+        if doc_numbers and any(num in user_lower for num in doc_numbers):
+            negation_context = [
+                r"en\s+realidad\s+(a|fueron)",
+                r"corresponden\s+(a|en)",
+                r"nunca\s+ha",
+                r"inflar\s+las\s+cifras",
+            ]
+            if any(re.search(pattern, user_lower) for pattern in negation_context):
+                return {
+                    "is_contradiction": True,
+                    "confidence": 0.88,
+                    "reason": "Usuario niega datos numéricos específicos del documento",
+                    "method": "heuristic_data_negation",
+                }
+        # HEURISTIC 3: scandal tone against a positive document.
+        scandal_count = sum(1 for pattern in self.scandal_keywords if re.search(pattern, user_lower))
+        if scandal_count >= 1 and pos_count >= 2:
+            return {
+                "is_contradiction": True,
+                "confidence": 0.82,
+                "reason": "Tono de escándalo contradice documento positivo",
+                "method": "heuristic_tone_mismatch",
+            }
+        return {"is_contradiction": False, "confidence": 0.0, "reason": "No se detectó contradicción heurística", "method": "none"}
+    # ---------------------------
+    # UTIL: pick the best paragraph by semantic similarity
+    # ---------------------------
+    def best_paragraph_by_similarity(
+        self, claim: str, document_paragraphs: List[Dict[str, str]]
+    ) -> "tuple[Optional[Dict[str, str]], float]":
+        """Return ``(best_paragraph, best_score)`` for ``claim``.
+
+        ``best_paragraph`` is ``None`` when there are no paragraphs or when
+        the best cosine similarity is below ``self.similarity_threshold``.
+        """
+        if not document_paragraphs:
+            return None, 0.0
+        texts = [p["text"] for p in document_paragraphs]
+        emb_claim = self.embedder.encode(claim, convert_to_tensor=True, normalize_embeddings=True)
+        emb_texts = self.embedder.encode(texts, convert_to_tensor=True, normalize_embeddings=True)
+        cosine_scores = util.cos_sim(emb_claim, emb_texts)[0].cpu().tolist()
+        # max() keeps the first index on ties.
+        best_idx, best_score = max(enumerate(cosine_scores), key=lambda pair: pair[1])
+        best_score = float(best_score)
+        if best_score < self.similarity_threshold:
+            return None, best_score
+        return document_paragraphs[best_idx], best_score
+    # ---------------------------
+    # Claim extraction and transformation
+    # ---------------------------
+    def is_verifiable_claim(self, sentence: str) -> bool:
+        """Return True when ``sentence`` looks like a checkable factual claim.
+
+        Opinion-style openings are rejected first; otherwise the sentence
+        must contain a digit, a percent sign, a change verb or a domain noun.
+        """
+        sentence_lower = sentence.lower().strip()
+        non_verifiable_patterns = [
+            r"^nadie\s+(cree|piensa)",
+            r"^todos\s+(saben|creen)",
+            r"^es\s+(mentira|falso|propaganda)\s*$",
+            r"^eso\s+es\s+(mentira|falso)",
+            r"^\w+\s+cree\s+",
+            r"^\w+\s+piensa\s+",
+            r"^otros\s+recuerdan\s+que.*y\s+preguntan",
+            r'^usuarios.*comentan.*["\'"].*["\'"]$',
+        ]
+        if any(re.match(pattern, sentence_lower) for pattern in non_verifiable_patterns):
+            return False
+        if sentence_lower.endswith("?") and "si también llevará" in sentence_lower:
+            return False
+        has_factual_content = (
+            re.search(r"\d+", sentence)
+            or re.search(r"%", sentence)
+            or re.search(r"\b(aumentado|disminuido|reducido|incrementado|subido|bajado)\b", sentence, re.IGNORECASE)
+            or re.search(r"\b(hospital|quirófano|equipo|camas|médicos|estudios|goteras|escándalo)\b", sentence, re.IGNORECASE)
+        )
+        return bool(has_factual_content)
+    def extract_and_transform_claim(self, sentence: str) -> tuple:
+        """Extract the factual claim from ``sentence`` and rewrite it if negated.
+
+        Returns:
+            ``(claim, is_negation, original_claim)``, or
+            ``(None, False, None)`` when no verifiable claim remains.
+        """
+        if not self.is_verifiable_claim(sentence):
+            return None, False, None
+        direct_negation = [
+            r"pero\s+(eso\s+)?es\s+falso",
+            r"es\s+mentira",
+            r"es\s+falso",
+            r"no\s+es\s+(cierto|verdad)",
+        ]
+        indirect_negation = [
+            r"es\s+pura\s+propaganda",
+            r"es\s+propaganda",
+            r"manipulan\s+(las\s+)?cifras",
+            r"manipulan\s+(los\s+)?datos",
+            r"(ese\s+)?supuesto\s+\d+%",
+            r"inflar\s+las\s+cifras",
+            r"en\s+realidad\s+a",
+            r"corresponden\s+en\s+realidad",
+            r"para\s+simular\s+equipo",
+            r"maquillaje\s+de\s+la",
+        ]
+        citation_with_negation = [
+            r"presumió.*en\s+realidad",
+            r"que\s+dice.*pero\s+en\s+realidad",
+        ]
+        is_negation = (
+            any(re.search(pattern, sentence, re.IGNORECASE) for pattern in direct_negation)
+            or any(re.search(pattern, sentence, re.IGNORECASE) for pattern in indirect_negation)
+            or any(re.search(pattern, sentence, re.IGNORECASE) for pattern in citation_with_negation)
+        )
+        claim = sentence
+        # Strip reporting prefixes ("users comment:", "workers say that ...").
+        meta_comment_patterns = [
+            r'^usuarios\s+irónicos\s+comentan:\s*["\'"]?',
+            r"^otros\s+recuerdan\s+que\s+",
+            r"^fuentes\s+internas.*revelaron\s+que\s+",
+            r"^trabajadores\s+aseguran\s+que\s+",
+            r'^.*(comentan|afirman|aseguran|dicen):\s*["\'"]?',
+        ]
+        for pattern in meta_comment_patterns:
+            claim = re.sub(pattern, "", claim, flags=re.IGNORECASE)
+        # Strip attribution prefixes, then trailing denial/opinion fragments.
+        claim = re.sub(
+            r"^.*(dice\s+que|afirma\s+que|según|de\s+acuerdo\s+con|informó\s+que|anunció\s+que)\s+",
+            "",
+            claim,
+            flags=re.IGNORECASE,
+        )
+        claim = re.sub(
+            r"^(todos|nadie)\s+(sabemos|saben|cree|creen)\s+que\s+",
+            "",
+            claim,
+            flags=re.IGNORECASE,
+        )
+        claim = re.sub(r"^(ese\s+)?supuesto\s+", "", claim, flags=re.IGNORECASE)
+        claim = re.sub(r"\s+(es\s+)?(pura\s+)?propaganda.*$", "", claim, flags=re.IGNORECASE)
+        claim = re.sub(r"\s+(pero|y)?\s*(eso\s+)?(es\s+)?(falso|mentira|incorrecto).*$", "", claim, flags=re.IGNORECASE)
+        claim = re.sub(r"\s+y\s+que\s+(manipulan|mienten|ocultan).*$", "", claim, flags=re.IGNORECASE)
+        claim = re.sub(r"\s+y\s+preguntan\s+si.*$", "", claim, flags=re.IGNORECASE)
+        claim = re.sub(r'["\'"]$', "", claim)
+        claim = claim.strip()
+        original_claim = claim
+        # Discard what is left if it is too short or has no factual anchor.
+        if len(claim) < 10 or not re.search(r"\d|aumenta|disminuye|reduce|violencia|hospital|quirófano|equipo|goteras|escándalo",
+                                           claim, re.IGNORECASE):
+            return None, False, None
+        if is_negation:
+            transformations = {
+                r"\b(redujeron|disminuyeron|bajaron|descendieron|redujo|disminuyó|reducción|disminución)\b": "aumentaron",
+                r"\b(aumentaron|incrementaron|subieron|crecieron|aumentó|incrementó|aumento|incremento)\b": "disminuyeron",
+                r"\bcorresponden\s+en\s+realidad\s+a\s+hospitales\s+privados\b": "fueron realizados en el hospital IMSS",
+                r"\bpara\s+simular\s+equipo\b": "con equipo real",
+            }
+            claim_transformed = claim
+            # Only the first matching transformation is applied.
+            for pattern, replacement in transformations.items():
+                if re.search(pattern, claim, re.IGNORECASE):
+                    claim_transformed = re.sub(pattern, replacement, claim, flags=re.IGNORECASE)
+                    break
+            # No rewrite applied: fall back to an explicit denial.
+            if claim_transformed == claim:
+                claim_transformed = f"No es cierto que {claim}"
+            claim = claim_transformed
+        elif re.search(r"\b(la\s+)?violencia\s+(ha\s+)?(aumentado|aumentó|incrementó|subió)\b", sentence, re.IGNORECASE):
+            # A "violence has risen" statement is flagged as a negation and
+            # reduced to the matched phrase.
+            is_negation = True
+            match = re.search(r"\b(la\s+)?violencia\s+(ha\s+)?(aumentado|aumentó|incrementó|subió)\b", sentence, re.IGNORECASE)
+            if match:
+                claim = match.group(0)
+                original_claim = claim
+        return claim, is_negation, original_claim
+    # ---------------------------
+    # MAIN: analyse the user's comment
+    # ---------------------------
+    def analyze_user_comment(self, user_text: str, document_paragraphs: List[Dict[str, str]]):
+        """Classify each verifiable sentence of ``user_text`` against the document.
+
+        Returns:
+            ``{"decision", "contradicciones", "total_oraciones", "detalles"}``;
+            ``detalles`` holds one entry per sentence that yielded a claim.
+        """
+        # Simple sentence segmentation: split on ". " and newlines; long
+        # fragments containing ";" are split again on ";".
+        sentences = []
+        temp_sentences = re.split(r"\.\s+|\n+", user_text)
+        for temp_s in temp_sentences:
+            if ";" in temp_s and len(temp_s) > 150:
+                sub_sentences = temp_s.split(";")
+                sentences.extend([s.strip() for s in sub_sentences if s.strip() and len(s.strip()) > 10])
+            else:
+                if temp_s.strip() and len(temp_s.strip()) > 10:
+                    sentences.append(temp_s.strip())
+        results = []
+        contradictions = 0
+        for s in sentences:
+            # Drop leading dashes and quotes.
+            s_clean = re.sub(r'^[–—\-"\'"]+\s*', "", s)
+            transformed_claim, is_negation, original_claim = self.extract_and_transform_claim(s_clean)
+            if transformed_claim is None:
+                # Not a verifiable claim: skip to the next sentence.
+                print(f"[DEBUG] ⚠️  Oración filtrada: '{s_clean[:80]}...'")
+                continue
+            # Sarcastic sentences are recorded but never sent to the NLI model.
+            if self.detect_sarcasm(s_clean):
+                print("[DEBUG] Detectado sarcasmo/opinión: salto NLI")
+                results.append({
+                    "sentence": s_clean,
+                    "cleaned_claim": original_claim,
+                    "transformed_claim": transformed_claim,
+                    "is_negation": is_negation,
+                    "best_label": "opinion_sarcastica",
+                    "best_score": 1.0,
+                    "detection_method": "sarcasm_detector",
+                    "paragraph": None,
+                    "scores_detail": {}
+                })
+                continue
+            # Pick the most semantically similar paragraph.
+            best_p, sim_score = self.best_paragraph_by_similarity(transformed_claim, document_paragraphs)
+            if best_p is None:
+                # No paragraph is similar enough; NLI is not forced.
+                print(f"[DEBUG] No se encontró párrafo relevante (sim_score={sim_score:.3f}) para: '{original_claim[:80]}...'")
+                results.append({
+                    "sentence": s_clean,
+                    "cleaned_claim": original_claim,
+                    "transformed_claim": transformed_claim,
+                    "is_negation": is_negation,
+                    "best_label": "sin_relacion_factica",
+                    "best_score": round(sim_score, 3),
+                    "detection_method": "semantic_filter",
+                    "paragraph": None,
+                    "scores_detail": {}
+                })
+                continue
+            # NLI between the chosen paragraph (premise) and the transformed claim.
+            nli_result = self.infer_nli(best_p["text"], transformed_claim)
+            print(f"\n[DEBUG] ✓ Oración: '{s_clean[:100]}...'")
+            print(f"[DEBUG] Claim: '{transformed_claim[:100]}...'")
+            print(f"[DEBUG] Sim. párrafo: {sim_score:.3f}")
+            print(f"[DEBUG] NLI: {nli_result['label']} ({nli_result['score']:.3f})")
+            final_label = nli_result["label"]
+            final_scores = nli_result["scores"]
+            detection_method = "nli"
+            # Hybrid step: consult the heuristics when NLI is confidently
+            # neutral or entailment.
+            if (nli_result["label"] in ["neutral", "entailment"] and nli_result["score"] > 0.65):
+                heuristic = self.detect_contradiction_by_content(s_clean, best_p["text"])
+                if heuristic["is_contradiction"]:
+                    final_label = "contradiction"
+                    final_scores = {
+                        "contradiction": heuristic["confidence"],
+                        "neutral": 1 - heuristic["confidence"],
+                        "entailment": 0.0,
+                    }
+                    detection_method = heuristic["method"]
+            # Fallback: a negated claim that NLI labels entailment has its
+            # entailment and contradiction scores swapped.
+            if is_negation and nli_result["label"] == "entailment":
+                final_label = "contradiction"
+                final_scores = {
+                    "contradiction": nli_result["scores"]["entailment"],
+                    "neutral": nli_result["scores"]["neutral"],
+                    "entailment": nli_result["scores"]["contradiction"],
+                }
+                detection_method = "nli_fallback"
+            # Final label: contradiction only above the configured threshold.
+            if final_scores["contradiction"] >= self.contradiction_threshold:
+                considered_label = "contradiction"
+            elif final_scores["entailment"] >= 0.55:
+                considered_label = "entailment"
+            else:
+                considered_label = "neutral"
+            results.append({
+                "sentence": s_clean,
+                "cleaned_claim": original_claim,
+                "transformed_claim": transformed_claim,
+                "is_negation": is_negation,
+                "best_label": considered_label,
+                "best_score": round(max(final_scores.values()) if final_scores else 0.0, 3),
+                "detection_method": detection_method,
+                "paragraph": (best_p["text"][:200] + "...") if len(best_p["text"]) > 200 else best_p["text"],
+                "scores_detail": {
+                    "contradiction": round(final_scores.get("contradiction", 0.0), 3),
+                    "neutral": round(final_scores.get("neutral", 0.0), 3),
+                    "entailment": round(final_scores.get("entailment", 0.0), 3),
+                }
+            })
+            if considered_label == "contradiction":
+                contradictions += 1
+        # Overall decision from the share of contradicting sentences.
+        total = len(results)
+        if total == 0:
+            decision = "sin suficiente información"
+        elif contradictions / total > 0.5:
+            decision = "distorsiona gravemente"
+        elif contradictions > 0:
+            decision = "distorsiona parcialmente"
+        else:
+            decision = "fiel o parcial"
+        return {
+            "decision": decision,
+            "contradicciones": contradictions,
+            "total_oraciones": total,
+            "detalles": results,
+        }

src/semantic/relevance.py ADDED Viewed

	@@ -0,0 +1,53 @@

+from sentence_transformers import SentenceTransformer, util
+from typing import List, Dict
+MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
+class SemanticRelevance:
+    """Score how related a user text is to a document and to each paragraph."""
+    def __init__(self):
+        self.model = SentenceTransformer(MODEL_NAME)
+    def relate(self, user_text: str, title: str, paragraphs: List[Dict[str, str]]):
+        """Compute document-level and per-paragraph cosine similarity.
+
+        Args:
+            user_text: The user's comment.
+            title: Document title, prepended to the joined paragraph texts.
+            paragraphs: Dicts with ``"type"`` and ``"text"`` keys.
+
+        Returns:
+            ``{"decision_document", "score_document", "best_paragraph",
+            "per_paragraph"}``. With no paragraphs, ``best_paragraph`` is a
+            placeholder with index -1 and score 0.0, so callers that read
+            ``best_paragraph["score"]`` keep working.
+        """
+        full_doc = title + ". " + " ".join(p["text"] for p in paragraphs)
+        emb_user = self.model.encode(
+            user_text, normalize_embeddings=True, convert_to_tensor=True
+        )
+        emb_doc = self.model.encode(
+            full_doc, normalize_embeddings=True, convert_to_tensor=True
+        )
+        score_doc = float(util.cos_sim(emb_user, emb_doc)[0][0])
+        per_paragraph = []
+        if paragraphs:
+            # Encode every paragraph in one batch rather than one call each.
+            emb_paragraphs = self.model.encode(
+                [p["text"] for p in paragraphs],
+                normalize_embeddings=True,
+                convert_to_tensor=True,
+            )
+            scores = util.cos_sim(emb_user, emb_paragraphs)[0].cpu().tolist()
+            per_paragraph = [
+                {
+                    "index": i,
+                    "type": p["type"],
+                    "text": p["text"],
+                    "score": round(float(score), 3),
+                }
+                for i, (p, score) in enumerate(zip(paragraphs, scores))
+            ]
+        if score_doc >= 0.48:
+            decision = "muy relacionado"
+        elif score_doc >= 0.35:
+            decision = "relacionado"
+        elif score_doc >= 0.20:
+            decision = "tangencial"
+        else:
+            decision = "no relacionado"
+        if per_paragraph:
+            best_p = max(per_paragraph, key=lambda x: x["score"])
+        else:
+            # max() raises ValueError on an empty list; return a zero-score
+            # placeholder instead of failing the whole request.
+            best_p = {"index": -1, "type": "", "text": "", "score": 0.0}
+        return {
+            "decision_document": decision,
+            "score_document": round(score_doc, 3),
+            "best_paragraph": best_p,
+            "per_paragraph": per_paragraph,
+        }

src/sesgos/__init__.py ADDED Viewed

File without changes

src/sesgos/sesgos.py ADDED Viewed

	@@ -0,0 +1,140 @@

+from transformers import pipeline
+from typing import List, Dict
+class BiasDetector:
+    """Keyword-based detector of rhetorical biases in Spanish text.
+
+    ``detect`` is the main entry point. It combines keyword matching
+    (``detect_heuristic_biases``) with a marker-count objectivity estimate
+    (``analyze_objectivity``).
+    """
+    # Phrases counted as signs of objective, sourced writing.
+    _OBJECTIVITY_MARKERS = (
+        "según",
+        "de acuerdo con",
+        "el informe señala",
+        "las cifras muestran",
+        "los datos indican",
+        "sin embargo",
+        "por otro lado",
+        "analistas",
+        "organizaciones",
+    )
+    # Phrases counted as signs of subjective, opinionated writing.
+    _SUBJECTIVITY_MARKERS = (
+        "obviamente",
+        "claramente",
+        "sin duda",
+        "es evidente que",
+        "todos saben",
+        "la verdad es",
+    )
+    def __init__(self):
+        # Imported locally so this block carries its own dependency.
+        import torch
+        # Use the first GPU when CUDA is available and fall back to CPU (-1)
+        # otherwise; a hard-coded device=0 fails on CPU-only hosts.
+        device = 0 if torch.cuda.is_available() else -1
+        # Sentiment classifier (optional): none of the methods of this class
+        # call it as written.
+        self.classifier = pipeline(
+            "text-classification",
+            model="citizenlab/twitter-xlm-roberta-base-sentiment-finetunned",
+            device=device,
+            top_k=None,
+        )
+        # Heuristic bias indicators: bias label -> trigger keywords/phrases.
+        self.bias_keywords = {
+            "apelación a la autoridad": [
+                "según", "de acuerdo con", "informó", "afirmó", "señaló",
+                "SESNSP", "gobierno", "autoridades", "expertos dicen", "oficialmente"
+            ],
+            "generalización apresurada": [
+                "todos", "nadie", "siempre", "nunca", "en todos los casos",
+                "absolutamente", "todos sabemos", "todo el mundo", "cualquiera sabe",
+                "varios de los",  # new
+            ],
+            "razonamiento emocional": [
+                "terrible", "desastroso", "maravilloso", "increíble",
+                "!!", "alarmante", "preocupante extremadamente", "indignante",
+                "pura propaganda", "mentiras", "mentira", "obviamente falso",
+                "escándalo", "bombo y platillo", "vicios ocultos",  # new
+                "maquillaje", "irónicos comentan", "ironía"  # new
+            ],
+            "falso dilema": [
+                "solo hay dos opciones", "o esto o aquello",
+                "no hay alternativa", "única solución", "o... o..."
+            ],
+            "ad hominem": [
+                "manipulan", "manipula", "mienten", "miente",
+                "corruptos", "corrupto", "deshonestos", "deshonesto",
+                "inflar las cifras", "para la foto"  # new
+            ]
+        }
+    def detect_heuristic_biases(self, text: str) -> List[Dict]:
+        """Detect biases by counting keyword hits per bias type.
+
+        A bias type is reported when at least two of its keywords occur in
+        ``text`` (case-insensitive). Confidence is 0.60 plus 0.10 per hit,
+        capped at 0.95 and rounded to 3 decimals.
+
+        Returns:
+            A list of dicts with keys ``sesgo``, ``confianza`` and ``metodo``;
+            empty when no bias type reaches two hits.
+        """
+        # NOTE(review): matching is plain substring containment, so a keyword
+        # that contains another one (e.g. "corruptos" / "corrupto") counts as
+        # two hits, and keywords can match inside longer words.
+        detected = []
+        text_lower = text.lower()
+        for bias_type, keywords in self.bias_keywords.items():
+            matches = sum(1 for kw in keywords if kw.lower() in text_lower)
+            if matches < 2:
+                continue
+            confidence = min(0.95, 0.60 + (matches * 0.10))
+            detected.append(
+                {
+                    "sesgo": bias_type,
+                    "confianza": round(confidence, 3),
+                    "metodo": "heurístico",
+                }
+            )
+        return detected
+    def analyze_objectivity(self, text: str) -> Dict:
+        """Count objectivity and subjectivity markers present in ``text``.
+
+        Returns:
+            A dict with ``objetivo`` and ``subjetivo`` (number of distinct
+            markers of each kind found, case-insensitive) and ``ratio``
+            (``objetivo`` divided by ``subjetivo``, using 1 as the divisor
+            when no subjectivity marker is found).
+        """
+        text_lower = text.lower()
+        obj_score = sum(1 for m in self._OBJECTIVITY_MARKERS if m in text_lower)
+        subj_score = sum(1 for m in self._SUBJECTIVITY_MARKERS if m in text_lower)
+        return {
+            "objetivo": obj_score,
+            "subjetivo": subj_score,
+            "ratio": obj_score / max(subj_score, 1),
+        }
+    def detect(self, text: str):
+        """Main bias-detection entry point.
+
+        Returns:
+            ``{"sesgos_detectados": [...]}``. The list holds the heuristic
+            detections when there are any; otherwise it holds one placeholder
+            entry, ``"texto objetivo"`` when the objectivity ratio is above 2
+            and ``"sin sesgos detectables"`` if not.
+        """
+        heuristic_biases = self.detect_heuristic_biases(text)
+        objectivity = self.analyze_objectivity(text)
+        if heuristic_biases:
+            # Strongly objective text lowers confidence in each detected bias
+            # by 0.20, never below 0.60. Rounded so the reported value does
+            # not carry float noise such as 0.6000000000000001.
+            if objectivity["ratio"] > 3:
+                for bias in heuristic_biases:
+                    bias["confianza"] = max(0.60, round(bias["confianza"] - 0.20, 3))
+            return {"sesgos_detectados": heuristic_biases}
+        # Nothing detected: say whether the text looked objective or simply
+        # had no detectable bias.
+        if objectivity["ratio"] > 2:
+            label, method = "texto objetivo", "análisis de objetividad"
+        else:
+            label, method = "sin sesgos detectables", "análisis heurístico"
+        return {
+            "sesgos_detectados": [
+                {
+                    "sesgo": label,
+                    "confianza": 1.0,
+                    "metodo": method,
+                }
+            ]
+        }