RodrickMJ committed
Commit · f1db1e3
1 Parent(s): e2848af

Add application file

Files changed:
- Dockerfile +14 -0
- requirements.txt +9 -0
- src/__init__.py +0 -0
- src/engine/__init__.py +0 -0
- src/engine/analysis_engine.py +180 -0
- src/index.py +68 -0
- src/mineria/__init__.py +0 -0
- src/mineria/mining.py +35 -0
- src/semantic/__init__.py +0 -0
- src/semantic/nli_distortion.py +475 -0
- src/semantic/relevance.py +53 -0
- src/sesgos/__init__.py +0 -0
- src/sesgos/sesgos.py +140 -0
Dockerfile
ADDED
@@ -0,0 +1,14 @@
FROM python:3.10

RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app

CMD ["uvicorn", "src.index:app", "--host", "0.0.0.0", "--port", "7860"]

requirements.txt
ADDED
@@ -0,0 +1,9 @@
fastapi
uvicorn[standard]
torch>=2.1.0
transformers>=4.36
sentence-transformers>=2.2.2
keybert>=0.8.0
bitsandbytes>=0.41.0
accelerate
scikit-learn

src/__init__.py
ADDED
File without changes

src/engine/__init__.py
ADDED
File without changes

src/engine/analysis_engine.py
ADDED
@@ -0,0 +1,180 @@
from src.semantic.relevance import SemanticRelevance
from src.semantic.nli_distortion import DistortionDetectorNLI
from src.sesgos.sesgos import BiasDetector
from src.mineria.mining import MiningFeatures
from datetime import datetime


class AnalysisEngine:
    def __init__(self):
        self.relevance = SemanticRelevance()
        self.distortion = DistortionDetectorNLI()
        self.bias = BiasDetector()
        self.mining = MiningFeatures()

    def map_veredicto(self, decision: str):
        if not decision:
            return "indefinido"

        decision = decision.lower()

        if "gravemente" in decision:
            return "distorsion"
        if "parcial" in decision:
            return "parcial"
        if "neutral" in decision:
            return "neutral"

        return "correcto"

    def analyze(self, user_text: str, document_paragraphs: list, title: str):
        # 1️⃣ Overall relevance
        rel = self.relevance.relate(user_text, title, document_paragraphs)

        # ✅ Special filter for RAG
        if title == "RAG_CONTEXT":
            best_score = rel["best_paragraph"]["score"]
            if best_score < 0.25:
                return {
                    "status": "rag_irrelevante",
                    "relevance": rel,
                    "message": "El contexto recuperado por el RAG no es relevante al comentario del usuario.",
                }

        # ✅ Standard filter for scraped documents
        if title != "RAG_CONTEXT" and rel["decision_document"] in [
            "no relacionado",
            "tangencial",
        ]:
            return {
                "status": "poco_relevante",
                "relevance": rel,
            }

        # 2️⃣ Distortion (NLI + heuristics)
        distortion = self.distortion.analyze_user_comment(
            user_text, document_paragraphs
        )

        # 3️⃣ Document biases
        full_document_text = (
            title + ". " + " ".join(p.get("text", "") for p in document_paragraphs)
        )
        biases_document = self.bias.detect(full_document_text)

        # 4️⃣ User biases
        biases_user = self.bias.detect(user_text)

        # 5️⃣ Text mining
        mining = self.mining.extract(user_text)

        # ✅ Final assembly, guarded against missing keys
        contradicciones_formateadas = []

        for d in distortion.get("detalles", []):
            scores = d.get("scores_detail", {}) or {}

            contradicciones_formateadas.append(
                {
                    "parrafo": d.get("paragraph", ""),
                    "oracion_usuario": d.get("sentence", ""),
                    "claim_extraido": d.get("cleaned_claim"),
                    "claim_transformado": d.get(
                        "transformed_claim", d.get("cleaned_claim")
                    ),
                    "negacion_detectada": d.get("is_negation", False),
                    "tipo_distorsion": d.get("best_label", "neutral"),
                    "puntaje_principal": round(d.get("best_score", 0.0), 3),
                    "puntajes_detallados": {
                        "contradiccion": round(scores.get("contradiction", 0.0), 3),
                        "neutral": round(scores.get("neutral", 0.0), 3),
                        "coincidencia": round(scores.get("entailment", 0.0), 3),
                    },
                }
            )

        return {
            "scraped_content": {
                "title": title,
                "url": "",
                "fecha_recoleccion": datetime.utcnow().isoformat(),
                "segmentos_contenido": [
                    {
                        "type": p.get("type", "p"),
                        "text": p.get("text", ""),
                    }
                    for p in document_paragraphs
                ],
            },
            "analisis": {
                "document_sesgo": {
                    "sesgos_encontrados": [
                        {
                            "label": s.get("sesgo", s.get("label", "desconocido")),
                            "score": s.get("confianza", s.get("score", 0.0)),
                        }
                        for s in biases_document.get("sesgos_detectados", [])
                    ],
                    "explicacion": "Sesgos detectados en el documento mediante análisis heurístico.",
                },
                "user_sesgo": {
                    "sesgos_encontrados": [
                        {
                            "label": s.get("sesgo", s.get("label", "desconocido")),
                            "score": s.get("confianza", s.get("score", 0.0)),
                        }
                        for s in biases_user.get("sesgos_detectados", [])
                    ],
                    "explicacion": "Sesgos detectados en el comentario del usuario.",
                },
                "document_distorsion": {
                    "veredicto": self.map_veredicto(distortion.get("decision")),
                    "contradicciones": contradicciones_formateadas,
                },
                "mineria": mining,
            },
        }

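For reference, a minimal direct-usage sketch of the engine above (not part of this commit). It assumes enough memory and a suitable device to load the four model-backed components; the inputs are purely illustrative.

from src.engine.analysis_engine import AnalysisEngine

engine = AnalysisEngine()
result = engine.analyze(
    user_text="El hospital nunca ha funcionado y tiene goteras.",
    document_paragraphs=[{"type": "p", "text": "El gobierno inauguró un hospital con 120 camas."}],
    title="Inauguración de hospital",
)
# Prints "poco_relevante" / "rag_irrelevante" when a relevance filter short-circuits,
# "ok" otherwise (in which case the full report is in `result`).
print(result.get("status", "ok"))
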
src/index.py
ADDED
@@ -0,0 +1,68 @@
from fastapi import FastAPI
from pydantic import BaseModel
from typing import List
from contextlib import asynccontextmanager
from src.engine.analysis_engine import AnalysisEngine

engine = None

@asynccontextmanager
async def lifespan(app: FastAPI):
    global engine
    engine = AnalysisEngine()
    yield

app = FastAPI(lifespan=lifespan, title="FactChecker Ultra-Ligero")

class Paragraph(BaseModel):
    type: str
    text: str

class AnalysisRequest(BaseModel):
    title: str
    paragraphs: List[Paragraph]
    user_text: str

class RagItem(BaseModel):
    summary: str

class RagAnalysisRequest(BaseModel):
    rag_results: List[RagItem]
    user_text: str

@app.get("/")
def home():
    return {"status": "corriendo", "modelos_cargados": "una sola vez"}

@app.post("/analyze")
async def analyze(payload: AnalysisRequest):
    result = engine.analyze(
        user_text=payload.user_text,
        document_paragraphs=[p.dict() for p in payload.paragraphs],
        title=payload.title,
    )
    return result


@app.post("/analyze-rag")
async def analyze_rag(payload: RagAnalysisRequest):
    # 1. Convert the RAG summaries into the analyzer's standard paragraph format
    rag_paragraphs = [
        {
            "type": "rag_summary",
            "text": item.summary
        }
        for item in payload.rag_results
    ]

    # 2. Run the SAME engine
    result = engine.analyze(
        user_text=payload.user_text,
        document_paragraphs=rag_paragraphs,
        title="RAG_CONTEXT"
    )

    # 3. Mark that the result comes from RAG (for future traceability)
    result["source"] = "rag"

    return result

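For reference, a minimal client sketch (not part of this commit) showing the request shapes the two endpoints above accept. The host and port are assumptions taken from the Dockerfile CMD, the payload values are illustrative, and the `requests` package is assumed to be available on the client side.

import requests

BASE = "http://localhost:7860"  # assumption: local run on the port from the Dockerfile CMD

# /analyze expects a title, typed paragraphs, and the user's comment
scrape_payload = {
    "title": "Inauguración de hospital",
    "paragraphs": [{"type": "p", "text": "El gobierno inauguró un hospital con 120 camas."}],
    "user_text": "El hospital nunca ha funcionado y tiene goteras.",
}
resp = requests.post(f"{BASE}/analyze", json=scrape_payload, timeout=300)
resp.raise_for_status()
print(resp.json())

# /analyze-rag expects retrieved summaries plus the user's comment
rag_payload = {
    "rag_results": [{"summary": "El hospital cuenta con 120 camas y equipo nuevo."}],
    "user_text": "El hospital nunca ha funcionado y tiene goteras.",
}
print(requests.post(f"{BASE}/analyze-rag", json=rag_payload, timeout=300).json())
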
src/mineria/__init__.py
ADDED
File without changes

src/mineria/mining.py
ADDED
@@ -0,0 +1,35 @@
from transformers import pipeline
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer


class MiningFeatures:
    def __init__(self):
        self.sentiment = pipeline(
            "sentiment-analysis",
            model="nlptown/bert-base-multilingual-uncased-sentiment",
        )
        embedder = SentenceTransformer(
            "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
        )
        self.kw_model = KeyBERT(model=embedder)

    def extract(self, text: str):
        sent = self.sentiment(text[:512])[0]
        label = (
            "NEG"
            if "1 star" in sent["label"] or "2 stars" in sent["label"]
            else "NEU" if "3 stars" in sent["label"] else "POS"
        )

        keywords = self.kw_model.extract_keywords(
            text,
            keyphrase_ngram_range=(1, 3),
            stop_words=["de", "la", "que", "el", "en", "y", "a", "los", "un", "una"],
            top_n=8,
        )

        return {
            "sentiment": {"label": label, "score": round(sent["score"], 3)},
            "keywords": [k[0] for k in keywords],
        }

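A usage sketch for the class above (not part of this commit); the text is illustrative and the models are downloaded on first use.

from src.mineria.mining import MiningFeatures

miner = MiningFeatures()
out = miner.extract("El nuevo hospital es un desastre, nunca ha funcionado y tiene goteras.")
print(out["sentiment"])  # e.g. {"label": "NEG", "score": ...}
print(out["keywords"])   # up to 8 keyphrases of 1-3 words
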
src/semantic/__init__.py
ADDED
File without changes

src/semantic/nli_distortion.py
ADDED
@@ -0,0 +1,475 @@
# src/semantic/nli_distortion.py
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from typing import List, Dict, Optional
import re

# Embeddings
from sentence_transformers import SentenceTransformer, util


class DistortionDetectorNLI:
    def __init__(
        self,
        model_name: str = "MoritzLaurer/deberta-v3-base-mnli-fever-anli",
        emb_model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        similarity_threshold: float = 0.25,
        contradiction_threshold: float = 0.55,
    ):
        # NLI model
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name, load_in_4bit=True, device_map="auto", torch_dtype=torch.float16
        )
        self.labels = ["contradiction", "neutral", "entailment"]

        # Embedding model for semantic matching
        self.embedder = SentenceTransformer(emb_model_name)
        # parameters
        self.similarity_threshold = similarity_threshold
        self.contradiction_threshold = contradiction_threshold

        # Original heuristics (copied/adapted)
        self.negative_claims = [
            r"goteras?",
            r"sin\s+equipo",
            r"no\s+funciona",
            r"nunca\s+ha\s+funcionado",
            r"descompuesto",
            r"cartón",
            r"madera\s+pintada",
            r"simular\s+equipo",
            r"maquetas?",
            r"hospitales\s+privados",
            r"se\s+inunda",
            r"filtraciones",
            r"vicios\s+ocultos",
        ]

        self.positive_indicators_doc = [
            r"inauguró",
            r"inversión\s+de\s+\d+",
            r"cuenta\s+con\s+\d+",
            r"atención\s+de\s+calidad",
            r"servicios.*avanzados",
            r"han\s+otorgado\s+\d+",
        ]

        self.scandal_keywords = [
            r"escándalo",
            r"vicios\s+ocultos",
            r"bombo\s+y\s+platillo",
            r"maquillaje",
        ]

    # ---------------------------
    # UTIL: basic cleanup
    # ---------------------------
    def normalize(self, text: str) -> str:
        if not text:
            return ""
        text = text.lower().strip()
        text = re.sub(r"\s+", " ", text)
        return text

    # ---------------------------
    # UTIL: lightweight sarcasm detector
    # ---------------------------
    def detect_sarcasm(self, text: str) -> bool:
        if not text:
            return False
        marcas = [
            "felicidades",
            "qué orgullo",
            "claro que sí",
            "ajá",
            "sí cómo no",
            "bravo",
            "qué bien",
            "maravilloso",
            "genial pero",
            "qué orgullo",
        ]
        t = text.lower()
        return any(m in t for m in marcas)

    # ---------------------------
    # UTIL: NLI inference
    # ---------------------------
    def infer_nli(self, premise: str, hypothesis: str):
        inputs = self.tokenizer(
            premise, hypothesis, return_tensors="pt", truncation=True, max_length=512
        ).to(self.model.device)

        with torch.no_grad():
            logits = self.model(**inputs).logits

        probs = torch.softmax(logits, dim=1)[0].cpu()
        idx = probs.argmax().item()

        return {
            "label": self.labels[idx],
            "score": float(probs[idx]),
            "scores": {
                "contradiction": float(probs[0]),
                "neutral": float(probs[1]),
                "entailment": float(probs[2]),
            },
        }

    # ---------------------------
    # Original heuristic (preserved)
    # ---------------------------
    def detect_contradiction_by_content(self, user_claim: str, document_text: str) -> dict:
        user_lower = user_claim.lower()
        doc_lower = document_text.lower()

        neg_count = sum(1 for pattern in self.negative_claims if re.search(pattern, user_lower))
        pos_count = sum(1 for pattern in self.positive_indicators_doc if re.search(pattern, doc_lower))

        # HEURISTIC 1
        if neg_count >= 2 and pos_count >= 2:
            problems_in_doc = sum(1 for pattern in self.negative_claims if re.search(pattern, doc_lower))
            if problems_in_doc == 0:
                confidence = min(0.95, 0.70 + (neg_count * 0.08))
                return {
                    "is_contradiction": True,
                    "confidence": confidence,
                    "reason": f"Usuario afirma {neg_count} problemas no mencionados en documento positivo",
                    "method": "heuristic_negative_claims",
                }

        # HEURISTIC 2: negation of numeric data
        doc_numbers = re.findall(r"\d+", doc_lower)
        if doc_numbers and any(num in user_lower for num in doc_numbers):
            negation_context = [
                r"en\s+realidad\s+(a|fueron)",
                r"corresponden\s+(a|en)",
                r"nunca\s+ha",
                r"inflar\s+las\s+cifras",
            ]
            if any(re.search(pattern, user_lower) for pattern in negation_context):
                return {
                    "is_contradiction": True,
                    "confidence": 0.88,
                    "reason": "Usuario niega datos numéricos específicos del documento",
                    "method": "heuristic_data_negation",
                }

        # HEURISTIC 3: scandal tone vs. positive document
        scandal_count = sum(1 for pattern in self.scandal_keywords if re.search(pattern, user_lower))
        if scandal_count >= 1 and pos_count >= 2:
            return {
                "is_contradiction": True,
                "confidence": 0.82,
                "reason": "Tono de escándalo contradice documento positivo",
                "method": "heuristic_tone_mismatch",
            }

        return {"is_contradiction": False, "confidence": 0.0, "reason": "No se detectó contradicción heurística", "method": "none"}

    # ---------------------------
    # UTIL: select the best paragraph by semantic similarity
    # ---------------------------
    def best_paragraph_by_similarity(self, claim: str, document_paragraphs: List[Dict[str, str]]) -> tuple:
        """
        Returns a tuple: (best_paragraph, best_score).
        If the similarity threshold is not met, returns (None, best_score).
        """
        if not document_paragraphs:
            return None, 0.0

        # build the list of texts
        texts = [p["text"] for p in document_paragraphs]
        # compute embeddings (normalized internally by sentence-transformers)
        emb_claim = self.embedder.encode(claim, convert_to_tensor=True, normalize_embeddings=True)
        emb_texts = self.embedder.encode(texts, convert_to_tensor=True, normalize_embeddings=True)

        cosine_scores = util.cos_sim(emb_claim, emb_texts)[0].cpu().tolist()

        best_idx = int(max(range(len(cosine_scores)), key=lambda i: cosine_scores[i]))
        best_score = float(cosine_scores[best_idx])
        best_paragraph = document_paragraphs[best_idx]

        # if below the threshold, return None
        if best_score < self.similarity_threshold:
            return None, best_score

        return best_paragraph, best_score

    # ---------------------------
    # Extract and transform the claim
    # ---------------------------
    def is_verifiable_claim(self, sentence: str) -> bool:
        sentence_lower = sentence.lower().strip()
        non_verifiable_patterns = [
            r"^nadie\s+(cree|piensa)",
            r"^todos\s+(saben|creen)",
            r"^es\s+(mentira|falso|propaganda)\s*$",
            r"^eso\s+es\s+(mentira|falso)",
            r"^\w+\s+cree\s+",
            r"^\w+\s+piensa\s+",
            r"^otros\s+recuerdan\s+que.*y\s+preguntan",
            r'^usuarios.*comentan.*["\'"].*["\'"]$',
        ]

        if any(re.match(pattern, sentence_lower) for pattern in non_verifiable_patterns):
            return False

        if sentence_lower.endswith("?") and "si también llevará" in sentence_lower:
            return False

        has_factual_content = (
            re.search(r"\d+", sentence)
            or re.search(r"%", sentence)
            or re.search(r"\b(aumentado|disminuido|reducido|incrementado|subido|bajado)\b", sentence, re.IGNORECASE)
            or re.search(r"\b(hospital|quirófano|equipo|camas|médicos|estudios|goteras|escándalo)\b", sentence, re.IGNORECASE)
        )
        return bool(has_factual_content)

    def extract_and_transform_claim(self, sentence: str) -> tuple:
        """Extracts and transforms verifiable claims (original logic preserved)."""
        if not self.is_verifiable_claim(sentence):
            return None, False, None

        # Same extraction and transformation logic as before
        direct_negation = [
            r"pero\s+(eso\s+)?es\s+falso",
            r"es\s+mentira",
            r"es\s+falso",
            r"no\s+es\s+(cierto|verdad)",
        ]

        indirect_negation = [
            r"es\s+pura\s+propaganda",
            r"es\s+propaganda",
            r"manipulan\s+(las\s+)?cifras",
            r"manipulan\s+(los\s+)?datos",
            r"(ese\s+)?supuesto\s+\d+%",
            r"inflar\s+las\s+cifras",
            r"en\s+realidad\s+a",
            r"corresponden\s+en\s+realidad",
            r"para\s+simular\s+equipo",
            r"maquillaje\s+de\s+la",
        ]

        citation_with_negation = [
            r"presumió.*en\s+realidad",
            r"que\s+dice.*pero\s+en\s+realidad",
        ]

        is_negation = (
            any(re.search(pattern, sentence, re.IGNORECASE) for pattern in direct_negation)
            or any(re.search(pattern, sentence, re.IGNORECASE) for pattern in indirect_negation)
            or any(re.search(pattern, sentence, re.IGNORECASE) for pattern in citation_with_negation)
        )

        claim = sentence

        meta_comment_patterns = [
            r'^usuarios\s+irónicos\s+comentan:\s*["\'"]?',
            r"^otros\s+recuerdan\s+que\s+",
            r"^fuentes\s+internas.*revelaron\s+que\s+",
            r"^trabajadores\s+aseguran\s+que\s+",
            r'^.*(comentan|afirman|aseguran|dicen):\s*["\'"]?',
        ]

        for pattern in meta_comment_patterns:
            claim = re.sub(pattern, "", claim, flags=re.IGNORECASE)

        claim = re.sub(
            r"^.*(dice\s+que|afirma\s+que|según|de\s+acuerdo\s+con|informó\s+que|anunció\s+que)\s+",
            "",
            claim,
            flags=re.IGNORECASE,
        )

        claim = re.sub(
            r"^(todos|nadie)\s+(sabemos|saben|cree|creen)\s+que\s+",
            "",
            claim,
            flags=re.IGNORECASE,
        )
        claim = re.sub(r"^(ese\s+)?supuesto\s+", "", claim, flags=re.IGNORECASE)
        claim = re.sub(r"\s+(es\s+)?(pura\s+)?propaganda.*$", "", claim, flags=re.IGNORECASE)
        claim = re.sub(r"\s+(pero|y)?\s*(eso\s+)?(es\s+)?(falso|mentira|incorrecto).*$", "", claim, flags=re.IGNORECASE)
        claim = re.sub(r"\s+y\s+que\s+(manipulan|mienten|ocultan).*$", "", claim, flags=re.IGNORECASE)
        claim = re.sub(r"\s+y\s+preguntan\s+si.*$", "", claim, flags=re.IGNORECASE)
        claim = re.sub(r'["\'"]$', "", claim)

        claim = claim.strip()
        original_claim = claim

        if len(claim) < 10 or not re.search(r"\d|aumenta|disminuye|reduce|violencia|hospital|quirófano|equipo|goteras|escándalo",
                                            claim, re.IGNORECASE):
            return None, False, None

        if is_negation:
            transformations = {
                r"\b(redujeron|disminuyeron|bajaron|descendieron|redujo|disminuyó|reducción|disminución)\b": "aumentaron",
                r"\b(aumentaron|incrementaron|subieron|crecieron|aumentó|incrementó|aumento|incremento)\b": "disminuyeron",
                r"\bcorresponden\s+en\s+realidad\s+a\s+hospitales\s+privados\b": "fueron realizados en el hospital IMSS",
                r"\bpara\s+simular\s+equipo\b": "con equipo real",
            }

            claim_transformed = claim
            for pattern, replacement in transformations.items():
                if re.search(pattern, claim, re.IGNORECASE):
                    claim_transformed = re.sub(pattern, replacement, claim, flags=re.IGNORECASE)
                    break

            if claim_transformed == claim:
                claim_transformed = f"No es cierto que {claim}"

            claim = claim_transformed

        elif re.search(r"\b(la\s+)?violencia\s+(ha\s+)?(aumentado|aumentó|incrementó|subió)\b", sentence, re.IGNORECASE):
            is_negation = True
            match = re.search(r"\b(la\s+)?violencia\s+(ha\s+)?(aumentado|aumentó|incrementó|subió)\b", sentence, re.IGNORECASE)
            if match:
                claim = match.group(0)
                original_claim = claim

        return claim, is_negation, original_claim

    # ---------------------------
    # MAIN: analyze the user comment
    # ---------------------------
    def analyze_user_comment(self, user_text: str, document_paragraphs: List[Dict[str, str]]):
        # simple sentence segmentation
        sentences = []
        temp_sentences = re.split(r"\.\s+|\n+", user_text)

        for temp_s in temp_sentences:
            if ";" in temp_s and len(temp_s) > 150:
                sub_sentences = temp_s.split(";")
                sentences.extend([s.strip() for s in sub_sentences if s.strip() and len(s.strip()) > 10])
            else:
                if temp_s.strip() and len(temp_s.strip()) > 10:
                    sentences.append(temp_s.strip())

        results = []
        contradictions = 0

        for s in sentences:
            s_clean = re.sub(r'^[–—\-"\'"]+\s*', "", s)
            transformed_claim, is_negation, original_claim = self.extract_and_transform_claim(s_clean)

            if transformed_claim is None:
                # not a verifiable claim -> move on to the next sentence
                print(f"[DEBUG] ⚠️ Oración filtrada: '{s_clean[:80]}...'")
                continue

            # SARCASM CHECK
            if self.detect_sarcasm(s_clean):
                print("[DEBUG] Detectado sarcasmo/opinión: salto NLI")
                results.append({
                    "sentence": s_clean,
                    "cleaned_claim": original_claim,
                    "transformed_claim": transformed_claim,
                    "is_negation": is_negation,
                    "best_label": "opinion_sarcastica",
                    "best_score": 1.0,
                    "detection_method": "sarcasm_detector",
                    "paragraph": None,
                    "scores_detail": {}
                })
                continue

            # Select the most SEMANTICALLY relevant paragraph
            best_p, sim_score = self.best_paragraph_by_similarity(transformed_claim, document_paragraphs)

            if best_p is None:
                # No sufficiently similar paragraph; do not force NLI
                print(f"[DEBUG] No se encontró párrafo relevante (sim_score={sim_score:.3f}) para: '{original_claim[:80]}...'")
                results.append({
                    "sentence": s_clean,
                    "cleaned_claim": original_claim,
                    "transformed_claim": transformed_claim,
                    "is_negation": is_negation,
                    "best_label": "sin_relacion_factica",
                    "best_score": round(sim_score, 3),
                    "detection_method": "semantic_filter",
                    "paragraph": None,
                    "scores_detail": {}
                })
                continue

            # Run NLI between the most relevant paragraph and the transformed claim
            nli_result = self.infer_nli(best_p["text"], transformed_claim)

            print(f"\n[DEBUG] ✓ Oración: '{s_clean[:100]}...'")
            print(f"[DEBUG] Claim: '{transformed_claim[:100]}...'")
            print(f"[DEBUG] Sim. párrafo: {sim_score:.3f}")
            print(f"[DEBUG] NLI: {nli_result['label']} ({nli_result['score']:.3f})")

            final_label = nli_result["label"]
            final_scores = nli_result["scores"]
            detection_method = "nli"

            # Hybrid strategy: apply the heuristics when NLI returns neutral/entailment with a high score
            if (nli_result["label"] in ["neutral", "entailment"] and nli_result["score"] > 0.65):
                heuristic = self.detect_contradiction_by_content(s_clean, best_p["text"])
                if heuristic["is_contradiction"]:
                    final_label = "contradiction"
                    final_scores = {
                        "contradiction": heuristic["confidence"],
                        "neutral": 1 - heuristic["confidence"],
                        "entailment": 0.0,
                    }
                    detection_method = heuristic["method"]

            # Fallback: if the claim is a negation but NLI returns entailment, invert the scores
            if is_negation and nli_result["label"] == "entailment":
                final_label = "contradiction"
                final_scores = {
                    "contradiction": nli_result["scores"]["entailment"],
                    "neutral": nli_result["scores"]["neutral"],
                    "entailment": nli_result["scores"]["contradiction"],
                }
                detection_method = "nli_fallback"

            # Robust decision: only count a contradiction if it exceeds the threshold
            if final_scores["contradiction"] >= self.contradiction_threshold:
                considered_label = "contradiction"
            elif final_scores["entailment"] >= 0.55:
                considered_label = "entailment"
            else:
                considered_label = "neutral"

            results.append({
                "sentence": s_clean,
                "cleaned_claim": original_claim,
                "transformed_claim": transformed_claim,
                "is_negation": is_negation,
                "best_label": considered_label,
                "best_score": round(max(final_scores.values()) if final_scores else 0.0, 3),
                "detection_method": detection_method,
                "paragraph": (best_p["text"][:200] + "...") if len(best_p["text"]) > 200 else best_p["text"],
                "scores_detail": {
                    "contradiction": round(final_scores.get("contradiction", 0.0), 3),
                    "neutral": round(final_scores.get("neutral", 0.0), 3),
                    "entailment": round(final_scores.get("entailment", 0.0), 3),
                }
            })

            if considered_label == "contradiction":
                contradictions += 1

        total = len(results)
        if total == 0:
            decision = "sin suficiente información"
        elif contradictions / total > 0.5:
            decision = "distorsiona gravemente"
        elif contradictions > 0:
            decision = "distorsiona parcialmente"
        else:
            decision = "fiel o parcial"

        return {
            "decision": decision,
            "contradicciones": contradictions,
            "total_oraciones": total,
            "detalles": results,
        }

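A usage sketch for the detector above (not part of this commit). It assumes a CUDA-capable GPU, since the constructor loads the NLI model with load_in_4bit=True and device_map="auto"; the texts are illustrative.

from src.semantic.nli_distortion import DistortionDetectorNLI

detector = DistortionDetectorNLI()
report = detector.analyze_user_comment(
    user_text="Ese supuesto 40% de reducción es pura propaganda; en realidad la violencia ha aumentado.",
    document_paragraphs=[{"type": "p", "text": "Las cifras muestran una reducción del 40% en la violencia."}],
)
print(report["decision"], report["contradicciones"], "/", report["total_oraciones"])
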
src/semantic/relevance.py
ADDED
@@ -0,0 +1,53 @@
from sentence_transformers import SentenceTransformer, util
from typing import List, Dict

MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"


class SemanticRelevance:
    def __init__(self):
        self.model = SentenceTransformer(MODEL_NAME)

    def relate(self, user_text: str, title: str, paragraphs: List[Dict[str, str]]):
        full_doc = title + ". " + " ".join(p["text"] for p in paragraphs)

        emb_user = self.model.encode(
            user_text, normalize_embeddings=True, convert_to_tensor=True
        )
        emb_doc = self.model.encode(
            full_doc, normalize_embeddings=True, convert_to_tensor=True
        )
        score_doc = float(util.cos_sim(emb_user, emb_doc)[0][0])

        per_paragraph = []
        for i, p in enumerate(paragraphs):
            emb_p = self.model.encode(
                p["text"], normalize_embeddings=True, convert_to_tensor=True
            )
            score = float(util.cos_sim(emb_user, emb_p)[0][0])
            per_paragraph.append(
                {
                    "index": i,
                    "type": p["type"],
                    "text": p["text"],
                    "score": round(score, 3),
                }
            )

        if score_doc >= 0.48:
            decision = "muy relacionado"
        elif score_doc >= 0.35:
            decision = "relacionado"
        elif score_doc >= 0.20:
            decision = "tangencial"
        else:
            decision = "no relacionado"

        best_p = max(per_paragraph, key=lambda x: x["score"])

        return {
            "decision_document": decision,
            "score_document": round(score_doc, 3),
            "best_paragraph": best_p,
            "per_paragraph": per_paragraph,
        }

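A usage sketch for the class above (not part of this commit); the inputs are illustrative and the comments only describe the shape of the result, not real scores.

from src.semantic.relevance import SemanticRelevance

rel = SemanticRelevance()
out = rel.relate(
    user_text="La violencia ha aumentado en todo el país.",
    title="Informe de seguridad",
    paragraphs=[{"type": "p", "text": "Las cifras muestran una reducción de la violencia."}],
)
# out["decision_document"]: "muy relacionado" | "relacionado" | "tangencial" | "no relacionado"
# out["best_paragraph"]: the highest-scoring paragraph together with its cosine score
print(out["decision_document"], out["score_document"])
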
src/sesgos/__init__.py
ADDED
File without changes

src/sesgos/sesgos.py
ADDED
@@ -0,0 +1,140 @@
from transformers import pipeline
from typing import List, Dict


class BiasDetector:
    def __init__(self):
        # Sentiment-analysis model (optional)
        self.classifier = pipeline(
            "text-classification",
            model="citizenlab/twitter-xlm-roberta-base-sentiment-finetunned",
            device=0,
            top_k=None,
        )

        # Heuristic bias indicators
        self.bias_keywords = {
            "apelación a la autoridad": [
                "según", "de acuerdo con", "informó", "afirmó", "señaló",
                "SESNSP", "gobierno", "autoridades", "expertos dicen", "oficialmente"
            ],
            "generalización apresurada": [
                "todos", "nadie", "siempre", "nunca", "en todos los casos",
                "absolutamente", "todos sabemos", "todo el mundo", "cualquiera sabe",
                "varios de los",  # ✅ NEW
            ],
            "razonamiento emocional": [
                "terrible", "desastroso", "maravilloso", "increíble",
                "!!", "alarmante", "preocupante extremadamente", "indignante",
                "pura propaganda", "mentiras", "mentira", "obviamente falso",
                "escándalo", "bombo y platillo", "vicios ocultos",  # ✅ NEW
                "maquillaje", "irónicos comentan", "ironía"  # ✅ NEW
            ],
            "falso dilema": [
                "solo hay dos opciones", "o esto o aquello",
                "no hay alternativa", "única solución", "o... o..."
            ],
            "ad hominem": [
                "manipulan", "manipula", "mienten", "miente",
                "corruptos", "corrupto", "deshonestos", "deshonesto",
                "inflar las cifras", "para la foto"  # ✅ NEW
            ]
        }


    def detect_heuristic_biases(self, text: str) -> List[Dict]:
        """Detects biases using keywords and patterns."""
        detected = []
        text_lower = text.lower()

        for bias_type, keywords in self.bias_keywords.items():
            matches = sum(1 for kw in keywords if kw.lower() in text_lower)

            if matches >= 2:  # at least 2 keywords
                confidence = min(0.95, 0.60 + (matches * 0.10))
                detected.append(
                    {
                        "sesgo": bias_type,
                        "confianza": round(confidence, 3),
                        "metodo": "heurístico",
                    }
                )

        return detected

    def analyze_objectivity(self, text: str) -> Dict:
        """
        Checks whether the text is objective or subjective.
        """
        # Objectivity markers
        objectivity_markers = [
            "según",
            "de acuerdo con",
            "el informe señala",
            "las cifras muestran",
            "los datos indican",
            "sin embargo",
            "por otro lado",
            "analistas",
            "organizaciones",
        ]

        # Subjectivity markers
        subjectivity_markers = [
            "obviamente",
            "claramente",
            "sin duda",
            "es evidente que",
            "todos saben",
            "la verdad es",
        ]

        text_lower = text.lower()
        obj_score = sum(1 for m in objectivity_markers if m in text_lower)
        subj_score = sum(1 for m in subjectivity_markers if m in text_lower)

        return {
            "objetivo": obj_score,
            "subjetivo": subj_score,
            "ratio": obj_score / max(subj_score, 1),
        }

    def detect(self, text: str):
        """
        Main bias-detection entry point.
        """
        # Heuristic analysis
        heuristic_biases = self.detect_heuristic_biases(text)

        # Objectivity analysis
        objectivity = self.analyze_objectivity(text)

        # If the text is highly objective, lower the confidence of detected biases
        if objectivity["ratio"] > 3 and len(heuristic_biases) > 0:
            for bias in heuristic_biases:
                bias["confianza"] = max(0.60, bias["confianza"] - 0.20)

        # If no biases were detected
        if not heuristic_biases:
            if objectivity["ratio"] > 2:
                return {
                    "sesgos_detectados": [
                        {
                            "sesgo": "texto objetivo",
                            "confianza": 1.0,
                            "metodo": "análisis de objetividad",
                        }
                    ]
                }
            else:
                return {
                    "sesgos_detectados": [
                        {
                            "sesgo": "sin sesgos detectables",
                            "confianza": 1.0,
                            "metodo": "análisis heurístico",
                        }
                    ]
                }

        return {"sesgos_detectados": heuristic_biases}

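A usage sketch for the detector above (not part of this commit). It assumes a CUDA device is available, since the constructor pins the sentiment pipeline (unused by detect()) to device=0; the text is illustrative.

from src.sesgos.sesgos import BiasDetector

detector = BiasDetector()
print(detector.detect("Todos sabemos que el gobierno miente y manipula las cifras, es indignante."))
# -> {"sesgos_detectados": [{"sesgo": ..., "confianza": ..., "metodo": "heurístico"}, ...]}
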