RodrickMJ committed on
Commit
f1db1e3
·
1 Parent(s): e2848af

Add application file

Browse files
Dockerfile ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: official CPython 3.10.
FROM python:3.10

# Create an unprivileged user (UID 1000) and run every following step as it.
RUN useradd -m -u 1000 user
USER user
# Put the user's local pip script directory on PATH.
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Copy only the requirements file first, then install the dependencies.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy the rest of the build context into /app, owned by the unprivileged user.
COPY --chown=user . /app

# Serve the `app` object from src/index.py with uvicorn on all interfaces, port 7860.
CMD ["uvicorn", "src.index:app", "--host", "0.0.0.0", "--port", "7860"]
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ torch>=2.1.0
4
+ transformers>=4.36
5
+ sentence-transformers>=2.2.2
6
+ keybert>=0.8.0
7
+ bitsandbytes>=0.41.0
8
+ accelerate
9
+ scikit-learn
src/__init__.py ADDED
File without changes
src/engine/__init__.py ADDED
File without changes
src/engine/analysis_engine.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.semantic.relevance import SemanticRelevance
2
+ from src.semantic.nli_distortion import DistortionDetectorNLI
3
+ from src.sesgos.sesgos import BiasDetector
4
+ from src.mineria.mining import MiningFeatures
5
+ from datetime import datetime
6
+
7
+
8
class AnalysisEngine:
    """Run relevance, distortion, bias and text-mining analysis on a comment.

    The engine owns one instance of each analyser and combines their outputs
    into the response dictionary returned by :meth:`analyze`.
    """

    # Title used by callers to mark paragraphs that come from a RAG retriever.
    RAG_TITLE = "RAG_CONTEXT"
    # Minimum best-paragraph similarity for RAG context to count as relevant.
    RAG_MIN_PARAGRAPH_SCORE = 0.25
    # Document-level relevance decisions that stop the analysis early.
    LOW_RELEVANCE_DECISIONS = ("no relacionado", "tangencial")

    def __init__(self):
        self.relevance = SemanticRelevance()
        self.distortion = DistortionDetectorNLI()
        self.bias = BiasDetector()
        self.mining = MiningFeatures()

    def map_veredicto(self, decision: str):
        """Map a distortion decision sentence to a short verdict code.

        Returns one of ``"indefinido"``, ``"distorsion"``, ``"parcial"``,
        ``"neutral"`` or ``"correcto"``.
        """
        if not decision:
            return "indefinido"

        decision = decision.lower()

        # No verifiable sentence was analysed, so there is no verdict to give.
        if "sin suficiente" in decision:
            return "indefinido"
        if "gravemente" in decision:
            return "distorsion"
        # "fiel o parcial" is the zero-contradiction decision; it must be
        # checked before the "parcial" substring test or it is misreported.
        if "fiel" in decision:
            return "correcto"
        if "parcial" in decision:
            return "parcial"
        if "neutral" in decision:
            return "neutral"

        return "correcto"

    @staticmethod
    def _format_biases(detection: dict) -> list:
        """Convert a bias-detector result into ``{"label", "score"}`` items."""
        return [
            {
                "label": s.get("sesgo", s.get("label", "desconocido")),
                "score": s.get("confianza", s.get("score", 0.0)),
            }
            for s in detection.get("sesgos_detectados", [])
        ]

    @staticmethod
    def _format_contradictions(distortion: dict) -> list:
        """Convert the per-sentence distortion details into response items."""
        formatted = []
        for d in distortion.get("detalles", []):
            # "scores_detail" may be missing or an empty dict for skipped sentences.
            scores = d.get("scores_detail", {}) or {}
            formatted.append(
                {
                    "parrafo": d.get("paragraph", ""),
                    "oracion_usuario": d.get("sentence", ""),
                    "claim_extraido": d.get("cleaned_claim"),
                    "claim_transformado": d.get(
                        "transformed_claim", d.get("cleaned_claim")
                    ),
                    "negacion_detectada": d.get("is_negation", False),
                    "tipo_distorsion": d.get("best_label", "neutral"),
                    "puntaje_principal": round(d.get("best_score", 0.0), 3),
                    "puntajes_detallados": {
                        "contradiccion": round(scores.get("contradiction", 0.0), 3),
                        "neutral": round(scores.get("neutral", 0.0), 3),
                        "coincidencia": round(scores.get("entailment", 0.0), 3),
                    },
                }
            )
        return formatted

    def analyze(self, user_text: str, document_paragraphs: list, title: str):
        """Analyse ``user_text`` against a document.

        Args:
            user_text: The user's comment.
            document_paragraphs: List of dicts with ``"type"`` and ``"text"`` keys.
            title: Document title, or ``"RAG_CONTEXT"`` when the paragraphs
                come from a RAG retriever.

        Returns:
            A dict with ``"status"`` and ``"relevance"`` when the content is
            filtered out as not relevant enough; otherwise a dict with
            ``"scraped_content"`` and ``"analisis"``.
        """
        # 1. Overall relevance of the comment to the document.
        rel = self.relevance.relate(user_text, title, document_paragraphs)
        is_rag = title == self.RAG_TITLE

        # RAG context is filtered on the best single paragraph.
        if is_rag:
            best_paragraph = rel.get("best_paragraph") or {}
            if best_paragraph.get("score", 0.0) < self.RAG_MIN_PARAGRAPH_SCORE:
                return {
                    "status": "rag_irrelevante",
                    "relevance": rel,
                    "message": "El contexto recuperado por el RAG no es relevante al comentario del usuario.",
                }

        # Other documents are filtered on the whole-document decision.
        if not is_rag and rel["decision_document"] in self.LOW_RELEVANCE_DECISIONS:
            return {
                "status": "poco_relevante",
                "relevance": rel,
            }

        # 2. Distortion detection on the user's comment.
        distortion = self.distortion.analyze_user_comment(
            user_text, document_paragraphs
        )

        # 3. Biases in the document (title plus all paragraph texts).
        full_document_text = (
            title + ". " + " ".join(p.get("text", "") for p in document_paragraphs)
        )
        biases_document = self.bias.detect(full_document_text)

        # 4. Biases in the user's comment.
        biases_user = self.bias.detect(user_text)

        # 5. Text mining on the user's comment.
        mining = self.mining.extract(user_text)

        return {
            "scraped_content": {
                "title": title,
                "url": "",
                "fecha_recoleccion": datetime.utcnow().isoformat(),
                "segmentos_contenido": [
                    {
                        "type": p.get("type", "p"),
                        "text": p.get("text", ""),
                    }
                    for p in document_paragraphs
                ],
            },
            "analisis": {
                "document_sesgo": {
                    "sesgos_encontrados": self._format_biases(biases_document),
                    "explicacion": "Sesgos detectados en el documento mediante análisis heurístico.",
                },
                "user_sesgo": {
                    "sesgos_encontrados": self._format_biases(biases_user),
                    "explicacion": "Sesgos detectados en el comentario del usuario.",
                },
                "document_distorsion": {
                    "veredicto": self.map_veredicto(distortion.get("decision")),
                    "contradicciones": self._format_contradictions(distortion),
                },
                "mineria": mining,
            },
        }
src/index.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from pydantic import BaseModel
3
+ from typing import List
4
+ from contextlib import asynccontextmanager
5
+ from src.engine.analysis_engine import AnalysisEngine
6
+
7
+ engine = None
8
+
9
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the shared AnalysisEngine before the application starts serving."""
    # Rebind the module-level `engine` so the route handlers can use it.
    global engine
    engine = AnalysisEngine()
    # Nothing follows the yield, so no shutdown cleanup is performed.
    yield
14
+
15
+ app = FastAPI(lifespan=lifespan, title="FactChecker Ultra-Ligero")
16
+
17
class Paragraph(BaseModel):
    """One document segment sent to the /analyze endpoint."""

    # Segment kind label supplied by the caller.
    type: str
    # Text content of the segment.
    text: str
20
+
21
class AnalysisRequest(BaseModel):
    """Request body of the /analyze endpoint."""

    # Document title.
    title: str
    # Document content as a list of segments.
    paragraphs: List[Paragraph]
    # The user's comment to analyse against the document.
    user_text: str
25
+
26
class RagItem(BaseModel):
    """One RAG result; only its summary text is used."""

    summary: str
28
+
29
class RagAnalysisRequest(BaseModel):
    """Request body of the /analyze-rag endpoint."""

    # RAG results whose summaries become the document paragraphs.
    rag_results: List[RagItem]
    # The user's comment to analyse against the RAG context.
    user_text: str
32
+
33
@app.get("/")
def home():
    """Return a static status payload."""
    return {"status": "corriendo", "modelos_cargados": "una sola vez"}
36
+
37
@app.post("/analyze")
async def analyze(payload: AnalysisRequest):
    """Analyse a user comment against a document supplied in the request.

    The engine call is blocking model inference, so it is run in the
    threadpool instead of directly on the event loop.
    """
    from fastapi.concurrency import run_in_threadpool

    # `model_dump` is the pydantic v2 name; `dict` is kept for pydantic v1.
    paragraphs = [
        p.model_dump() if hasattr(p, "model_dump") else p.dict()
        for p in payload.paragraphs
    ]
    result = await run_in_threadpool(
        engine.analyze,
        user_text=payload.user_text,
        document_paragraphs=paragraphs,
        title=payload.title,
    )
    return result
45
+
46
+
47
@app.post("/analyze-rag")
async def analyze_rag(payload: RagAnalysisRequest):
    """Analyse a user comment against summaries returned by a RAG retriever.

    The engine call is blocking model inference, so it is run in the
    threadpool instead of directly on the event loop.
    """
    from fastapi.concurrency import run_in_threadpool

    # 1. Convert each summary into the paragraph shape the engine expects.
    rag_paragraphs = [
        {"type": "rag_summary", "text": item.summary}
        for item in payload.rag_results
    ]

    # 2. Run the same engine; the special title selects its RAG relevance filter.
    result = await run_in_threadpool(
        engine.analyze,
        user_text=payload.user_text,
        document_paragraphs=rag_paragraphs,
        title="RAG_CONTEXT",
    )

    # 3. Tag the response so consumers can tell it came from RAG context.
    result["source"] = "rag"

    return result
src/mineria/__init__.py ADDED
File without changes
src/mineria/mining.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ from keybert import KeyBERT
3
+ from sentence_transformers import SentenceTransformer
4
+
5
+
6
class MiningFeatures:
    """Extract a sentiment label and keyphrases from a text."""

    def __init__(self):
        # Sentiment pipeline; `extract` matches "N star(s)" in its labels.
        self.sentiment = pipeline(
            "sentiment-analysis",
            model="nlptown/bert-base-multilingual-uncased-sentiment",
        )
        # Sentence embedder handed to KeyBERT for keyphrase extraction.
        keyword_encoder = SentenceTransformer(
            "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
        )
        self.kw_model = KeyBERT(model=keyword_encoder)

    def extract(self, text: str):
        """Return ``{"sentiment": {"label", "score"}, "keywords": [...]}``.

        The sentiment label is ``"NEG"`` for 1-2 stars, ``"NEU"`` for 3 stars
        and ``"POS"`` otherwise. Only the first 512 characters of ``text``
        are sent to the sentiment pipeline; keywords use the whole text.
        """
        prediction = self.sentiment(text[:512])[0]
        stars = prediction["label"]

        if "1 star" in stars or "2 stars" in stars:
            polarity = "NEG"
        elif "3 stars" in stars:
            polarity = "NEU"
        else:
            polarity = "POS"

        spanish_stop_words = ["de", "la", "que", "el", "en", "y", "a", "los", "un", "una"]
        ranked = self.kw_model.extract_keywords(
            text,
            keyphrase_ngram_range=(1, 3),
            stop_words=spanish_stop_words,
            top_n=8,
        )

        sentiment_summary = {"label": polarity, "score": round(prediction["score"], 3)}
        return {
            "sentiment": sentiment_summary,
            "keywords": [entry[0] for entry in ranked],
        }
src/semantic/__init__.py ADDED
File without changes
src/semantic/nli_distortion.py ADDED
@@ -0,0 +1,475 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/semantic/nli_distortion.py
2
import logging
import re
from typing import List, Dict, Optional, Tuple

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Embeddings
from sentence_transformers import SentenceTransformer, util
9
+
10
+
11
class DistortionDetectorNLI:
    """Detect whether a user comment contradicts a document.

    Each sentence of the comment is reduced to a claim, matched to its most
    similar document paragraph by embedding similarity, and then scored with
    an NLI model. Keyword heuristics can override the NLI result.
    """

    _log = logging.getLogger(__name__)

    # Canonical NLI label names; also the fallback label order.
    NLI_LABELS = ("contradiction", "neutral", "entailment")

    # Patterns for problems a user may claim about the subject.
    NEGATIVE_CLAIMS = (
        r"goteras?",
        r"sin\s+equipo",
        r"no\s+funciona",
        r"nunca\s+ha\s+funcionado",
        r"descompuesto",
        r"cartón",
        r"madera\s+pintada",
        r"simular\s+equipo",
        r"maquetas?",
        r"hospitales\s+privados",
        r"se\s+inunda",
        r"filtraciones",
        r"vicios\s+ocultos",
    )

    # Patterns that mark a document as reporting positive facts.
    POSITIVE_INDICATORS_DOC = (
        r"inauguró",
        r"inversión\s+de\s+\d+",
        r"cuenta\s+con\s+\d+",
        r"atención\s+de\s+calidad",
        r"servicios.*avanzados",
        r"han\s+otorgado\s+\d+",
    )

    # Patterns for a scandal tone in the user's text.
    SCANDAL_KEYWORDS = (
        r"escándalo",
        r"vicios\s+ocultos",
        r"bombo\s+y\s+platillo",
        r"maquillaje",
    )

    def __init__(
        self,
        model_name: str = "MoritzLaurer/deberta-v3-base-mnli-fever-anli",
        emb_model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        similarity_threshold: float = 0.25,
        contradiction_threshold: float = 0.55,
    ):
        """Load the NLI model, the embedding model and the heuristic patterns.

        Args:
            model_name: Hugging Face id of the NLI sequence classifier.
            emb_model_name: SentenceTransformer id used for paragraph matching.
            similarity_threshold: Minimum cosine similarity for a paragraph
                to be considered related to a claim.
            contradiction_threshold: Minimum contradiction score for a
                sentence to be counted as a contradiction.
        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if torch.cuda.is_available():
            # 4-bit loading is only attempted when a CUDA device is present.
            self.model = AutoModelForSequenceClassification.from_pretrained(
                model_name, load_in_4bit=True, device_map="auto", torch_dtype=torch.float16
            )
        else:
            # CPU-only host: load the unquantized model with default settings.
            self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

        # Take the label order from the checkpoint rather than assuming one.
        self.labels = self._resolve_labels(getattr(self.model.config, "id2label", None))

        # Embedding model for semantic paragraph matching.
        self.embedder = SentenceTransformer(emb_model_name)
        self.similarity_threshold = similarity_threshold
        self.contradiction_threshold = contradiction_threshold

        # Per-instance, mutable copies of the heuristic pattern lists.
        self.negative_claims = list(self.NEGATIVE_CLAIMS)
        self.positive_indicators_doc = list(self.POSITIVE_INDICATORS_DOC)
        self.scandal_keywords = list(self.SCANDAL_KEYWORDS)

    @classmethod
    def _resolve_labels(cls, id2label) -> List[str]:
        """Return label names ordered by logit index.

        Uses ``id2label`` when it names exactly the three NLI classes;
        otherwise falls back to ``NLI_LABELS`` order.
        """
        if id2label:
            try:
                ordered = [str(id2label[i]).lower() for i in range(len(id2label))]
            except KeyError:
                ordered = []
            if sorted(ordered) == sorted(cls.NLI_LABELS):
                return ordered
        return list(cls.NLI_LABELS)

    def normalize(self, text: str) -> str:
        """Lower-case ``text``, trim it and collapse whitespace runs."""
        if not text:
            return ""
        text = text.lower().strip()
        text = re.sub(r"\s+", " ", text)
        return text

    def detect_sarcasm(self, text: str) -> bool:
        """Return True when ``text`` contains one of the sarcasm marker phrases."""
        if not text:
            return False
        marcas = [
            "felicidades",
            "qué orgullo",
            "claro que sí",
            "ajá",
            "sí cómo no",
            "bravo",
            "qué bien",
            "maravilloso",
            "genial pero",
        ]
        t = text.lower()
        return any(m in t for m in marcas)

    def infer_nli(self, premise: str, hypothesis: str):
        """Run the NLI model on a premise/hypothesis pair.

        Returns:
            A dict with the winning ``"label"``, its ``"score"``, and
            ``"scores"`` for contradiction, neutral and entailment.
        """
        inputs = self.tokenizer(
            premise, hypothesis, return_tensors="pt", truncation=True, max_length=512
        ).to(self.model.device)

        with torch.no_grad():
            logits = self.model(**inputs).logits

        probs = torch.softmax(logits, dim=1)[0].cpu().tolist()
        # Pair each probability with the label at the same logit index.
        by_label = {label: float(p) for label, p in zip(self.labels, probs)}
        best_label = max(by_label, key=by_label.get)

        return {
            "label": best_label,
            "score": by_label[best_label],
            "scores": {name: by_label.get(name, 0.0) for name in self.NLI_LABELS},
        }

    def detect_contradiction_by_content(self, user_claim: str, document_text: str) -> dict:
        """Apply keyword heuristics to spot a contradiction.

        Returns:
            A dict with ``"is_contradiction"``, ``"confidence"``,
            ``"reason"`` and ``"method"``.
        """
        user_lower = user_claim.lower()
        doc_lower = document_text.lower()

        neg_count = sum(1 for pattern in self.negative_claims if re.search(pattern, user_lower))
        pos_count = sum(1 for pattern in self.positive_indicators_doc if re.search(pattern, doc_lower))

        # Heuristic 1: user lists problems that a positive document never mentions.
        if neg_count >= 2 and pos_count >= 2:
            problems_in_doc = sum(1 for pattern in self.negative_claims if re.search(pattern, doc_lower))
            if problems_in_doc == 0:
                confidence = min(0.95, 0.70 + (neg_count * 0.08))
                return {
                    "is_contradiction": True,
                    "confidence": confidence,
                    "reason": f"Usuario afirma {neg_count} problemas no mencionados en documento positivo",
                    "method": "heuristic_negative_claims",
                }

        # Heuristic 2: user repeats a number from the document in a negating context.
        doc_numbers = re.findall(r"\d+", doc_lower)
        if doc_numbers and any(num in user_lower for num in doc_numbers):
            negation_context = [
                r"en\s+realidad\s+(a|fueron)",
                r"corresponden\s+(a|en)",
                r"nunca\s+ha",
                r"inflar\s+las\s+cifras",
            ]
            if any(re.search(pattern, user_lower) for pattern in negation_context):
                return {
                    "is_contradiction": True,
                    "confidence": 0.88,
                    "reason": "Usuario niega datos numéricos específicos del documento",
                    "method": "heuristic_data_negation",
                }

        # Heuristic 3: scandal tone against a positive document.
        scandal_count = sum(1 for pattern in self.scandal_keywords if re.search(pattern, user_lower))
        if scandal_count >= 1 and pos_count >= 2:
            return {
                "is_contradiction": True,
                "confidence": 0.82,
                "reason": "Tono de escándalo contradice documento positivo",
                "method": "heuristic_tone_mismatch",
            }

        return {"is_contradiction": False, "confidence": 0.0, "reason": "No se detectó contradicción heurística", "method": "none"}

    def best_paragraph_by_similarity(
        self, claim: str, document_paragraphs: List[Dict[str, str]]
    ) -> Tuple[Optional[Dict[str, str]], float]:
        """Pick the paragraph most similar to ``claim``.

        Returns:
            ``(best_paragraph, best_score)``. ``best_paragraph`` is ``None``
            when there are no paragraphs or the best score is below
            ``similarity_threshold``.
        """
        if not document_paragraphs:
            return None, 0.0

        texts = [p["text"] for p in document_paragraphs]
        emb_claim = self.embedder.encode(claim, convert_to_tensor=True, normalize_embeddings=True)
        emb_texts = self.embedder.encode(texts, convert_to_tensor=True, normalize_embeddings=True)

        cosine_scores = util.cos_sim(emb_claim, emb_texts)[0].cpu().tolist()

        best_idx = max(range(len(cosine_scores)), key=lambda i: cosine_scores[i])
        best_score = float(cosine_scores[best_idx])

        if best_score < self.similarity_threshold:
            return None, best_score

        return document_paragraphs[best_idx], best_score

    def is_verifiable_claim(self, sentence: str) -> bool:
        """Return True when ``sentence`` looks like a checkable factual claim."""
        sentence_lower = sentence.lower().strip()
        non_verifiable_patterns = [
            r"^nadie\s+(cree|piensa)",
            r"^todos\s+(saben|creen)",
            r"^es\s+(mentira|falso|propaganda)\s*$",
            r"^eso\s+es\s+(mentira|falso)",
            r"^\w+\s+cree\s+",
            r"^\w+\s+piensa\s+",
            r"^otros\s+recuerdan\s+que.*y\s+preguntan",
            r'^usuarios.*comentan.*["\'"].*["\'"]$',
        ]

        if any(re.match(pattern, sentence_lower) for pattern in non_verifiable_patterns):
            return False

        if sentence_lower.endswith("?") and "si también llevará" in sentence_lower:
            return False

        # A claim needs a number, a percent sign, a change verb or a domain noun.
        has_factual_content = (
            re.search(r"\d+", sentence)
            or re.search(r"%", sentence)
            or re.search(r"\b(aumentado|disminuido|reducido|incrementado|subido|bajado)\b", sentence, re.IGNORECASE)
            or re.search(r"\b(hospital|quirófano|equipo|camas|médicos|estudios|goteras|escándalo)\b", sentence, re.IGNORECASE)
        )
        return bool(has_factual_content)

    def extract_and_transform_claim(self, sentence: str) -> Tuple[Optional[str], bool, Optional[str]]:
        """Extract the factual claim from ``sentence``.

        Returns:
            ``(claim, is_negation, original_claim)``. For a negation,
            ``claim`` is rewritten into the statement being denied. The
            result is ``(None, False, None)`` for a non-verifiable sentence.
        """
        if not self.is_verifiable_claim(sentence):
            return None, False, None

        direct_negation = [
            r"pero\s+(eso\s+)?es\s+falso",
            r"es\s+mentira",
            r"es\s+falso",
            r"no\s+es\s+(cierto|verdad)",
        ]

        indirect_negation = [
            r"es\s+pura\s+propaganda",
            r"es\s+propaganda",
            r"manipulan\s+(las\s+)?cifras",
            r"manipulan\s+(los\s+)?datos",
            r"(ese\s+)?supuesto\s+\d+%",
            r"inflar\s+las\s+cifras",
            r"en\s+realidad\s+a",
            r"corresponden\s+en\s+realidad",
            r"para\s+simular\s+equipo",
            r"maquillaje\s+de\s+la",
        ]

        citation_with_negation = [
            r"presumió.*en\s+realidad",
            r"que\s+dice.*pero\s+en\s+realidad",
        ]

        is_negation = any(
            re.search(pattern, sentence, re.IGNORECASE)
            for pattern in (*direct_negation, *indirect_negation, *citation_with_negation)
        )

        claim = sentence

        # Strip reporting prefixes such as "trabajadores aseguran que ...".
        meta_comment_patterns = [
            r'^usuarios\s+irónicos\s+comentan:\s*["\'"]?',
            r"^otros\s+recuerdan\s+que\s+",
            r"^fuentes\s+internas.*revelaron\s+que\s+",
            r"^trabajadores\s+aseguran\s+que\s+",
            r'^.*(comentan|afirman|aseguran|dicen):\s*["\'"]?',
        ]

        for pattern in meta_comment_patterns:
            claim = re.sub(pattern, "", claim, flags=re.IGNORECASE)

        claim = re.sub(
            r"^.*(dice\s+que|afirma\s+que|según|de\s+acuerdo\s+con|informó\s+que|anunció\s+que)\s+",
            "",
            claim,
            flags=re.IGNORECASE,
        )

        claim = re.sub(
            r"^(todos|nadie)\s+(sabemos|saben|cree|creen)\s+que\s+",
            "",
            claim,
            flags=re.IGNORECASE,
        )
        # Strip the negating / editorial tail so only the claim remains.
        claim = re.sub(r"^(ese\s+)?supuesto\s+", "", claim, flags=re.IGNORECASE)
        claim = re.sub(r"\s+(es\s+)?(pura\s+)?propaganda.*$", "", claim, flags=re.IGNORECASE)
        claim = re.sub(r"\s+(pero|y)?\s*(eso\s+)?(es\s+)?(falso|mentira|incorrecto).*$", "", claim, flags=re.IGNORECASE)
        claim = re.sub(r"\s+y\s+que\s+(manipulan|mienten|ocultan).*$", "", claim, flags=re.IGNORECASE)
        claim = re.sub(r"\s+y\s+preguntan\s+si.*$", "", claim, flags=re.IGNORECASE)
        claim = re.sub(r'["\'"]$', "", claim)

        claim = claim.strip()
        original_claim = claim

        if len(claim) < 10 or not re.search(
            r"\d|aumenta|disminuye|reduce|violencia|hospital|quirófano|equipo|goteras|escándalo",
            claim,
            re.IGNORECASE,
        ):
            return None, False, None

        if is_negation:
            transformations = {
                r"\b(redujeron|disminuyeron|bajaron|descendieron|redujo|disminuyó|reducción|disminución)\b": "aumentaron",
                r"\b(aumentaron|incrementaron|subieron|crecieron|aumentó|incrementó|aumento|incremento)\b": "disminuyeron",
                r"\bcorresponden\s+en\s+realidad\s+a\s+hospitales\s+privados\b": "fueron realizados en el hospital IMSS",
                r"\bpara\s+simular\s+equipo\b": "con equipo real",
            }

            claim_transformed = claim
            # Only the first matching transformation is applied.
            for pattern, replacement in transformations.items():
                if re.search(pattern, claim, re.IGNORECASE):
                    claim_transformed = re.sub(pattern, replacement, claim, flags=re.IGNORECASE)
                    break

            if claim_transformed == claim:
                claim_transformed = f"No es cierto que {claim}"

            claim = claim_transformed
        else:
            violence = re.search(
                r"\b(la\s+)?violencia\s+(ha\s+)?(aumentado|aumentó|incrementó|subió)\b",
                sentence,
                re.IGNORECASE,
            )
            if violence:
                # A "violence has increased" statement is treated as a negation.
                is_negation = True
                claim = violence.group(0)
                original_claim = claim

        return claim, is_negation, original_claim

    @staticmethod
    def _split_sentences(user_text: str) -> List[str]:
        """Split text on sentence ends and newlines; keep pieces over 10 chars."""
        sentences = []
        for chunk in re.split(r"\.\s+|\n+", user_text or ""):
            # Long chunks containing semicolons are split further.
            pieces = chunk.split(";") if ";" in chunk and len(chunk) > 150 else [chunk]
            sentences.extend(
                piece.strip() for piece in pieces if len(piece.strip()) > 10
            )
        return sentences

    def analyze_user_comment(self, user_text: str, document_paragraphs: List[Dict[str, str]]):
        """Score every verifiable sentence of ``user_text`` against the document.

        Returns:
            A dict with the overall ``"decision"``, the number of
            ``"contradicciones"``, ``"total_oraciones"`` analysed and the
            per-sentence ``"detalles"``.
        """
        results = []
        contradictions = 0

        for s in self._split_sentences(user_text):
            s_clean = re.sub(r'^[–—\-"\'"]+\s*', "", s)
            transformed_claim, is_negation, original_claim = self.extract_and_transform_claim(s_clean)

            if transformed_claim is None:
                # Not a verifiable claim: skipped and not counted.
                self._log.debug("Oración filtrada: '%s...'", s_clean[:80])
                continue

            base = {
                "sentence": s_clean,
                "cleaned_claim": original_claim,
                "transformed_claim": transformed_claim,
                "is_negation": is_negation,
            }

            # Sarcastic sentences are labelled as opinion and skip NLI.
            if self.detect_sarcasm(s_clean):
                self._log.debug("Detectado sarcasmo/opinión: salto NLI")
                results.append({
                    **base,
                    "best_label": "opinion_sarcastica",
                    "best_score": 1.0,
                    "detection_method": "sarcasm_detector",
                    "paragraph": None,
                    "scores_detail": {},
                })
                continue

            best_p, sim_score = self.best_paragraph_by_similarity(transformed_claim, document_paragraphs)

            if best_p is None:
                # No paragraph is similar enough, so NLI is not forced.
                self._log.debug(
                    "No se encontró párrafo relevante (sim_score=%.3f) para: '%s...'",
                    sim_score,
                    original_claim[:80],
                )
                results.append({
                    **base,
                    "best_label": "sin_relacion_factica",
                    "best_score": round(sim_score, 3),
                    "detection_method": "semantic_filter",
                    "paragraph": None,
                    "scores_detail": {},
                })
                continue

            paragraph_text = best_p["text"]
            nli_result = self.infer_nli(paragraph_text, transformed_claim)

            self._log.debug(
                "Oración: '%s...' | Claim: '%s...' | Sim. párrafo: %.3f | NLI: %s (%.3f)",
                s_clean[:100],
                transformed_claim[:100],
                sim_score,
                nli_result["label"],
                nli_result["score"],
            )

            final_scores = nli_result["scores"]
            detection_method = "nli"

            # Hybrid step: heuristics may override a confident neutral/entailment.
            if nli_result["label"] in ["neutral", "entailment"] and nli_result["score"] > 0.65:
                heuristic = self.detect_contradiction_by_content(s_clean, paragraph_text)
                if heuristic["is_contradiction"]:
                    final_scores = {
                        "contradiction": heuristic["confidence"],
                        "neutral": 1 - heuristic["confidence"],
                        "entailment": 0.0,
                    }
                    detection_method = heuristic["method"]

            # Fallback: a negation scored as entailment has its scores swapped.
            if is_negation and nli_result["label"] == "entailment":
                final_scores = {
                    "contradiction": nli_result["scores"]["entailment"],
                    "neutral": nli_result["scores"]["neutral"],
                    "entailment": nli_result["scores"]["contradiction"],
                }
                detection_method = "nli_fallback"

            # Final label: contradiction only when it clears the threshold.
            if final_scores["contradiction"] >= self.contradiction_threshold:
                considered_label = "contradiction"
            elif final_scores["entailment"] >= 0.55:
                considered_label = "entailment"
            else:
                considered_label = "neutral"

            results.append({
                **base,
                "best_label": considered_label,
                "best_score": round(max(final_scores.values()) if final_scores else 0.0, 3),
                "detection_method": detection_method,
                "paragraph": (paragraph_text[:200] + "...") if len(paragraph_text) > 200 else paragraph_text,
                "scores_detail": {
                    "contradiction": round(final_scores.get("contradiction", 0.0), 3),
                    "neutral": round(final_scores.get("neutral", 0.0), 3),
                    "entailment": round(final_scores.get("entailment", 0.0), 3),
                },
            })

            if considered_label == "contradiction":
                contradictions += 1

        total = len(results)
        if total == 0:
            decision = "sin suficiente información"
        elif contradictions / total > 0.5:
            decision = "distorsiona gravemente"
        elif contradictions > 0:
            decision = "distorsiona parcialmente"
        else:
            decision = "fiel o parcial"

        return {
            "decision": decision,
            "contradicciones": contradictions,
            "total_oraciones": total,
            "detalles": results,
        }
src/semantic/relevance.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer, util
2
+ from typing import List, Dict
3
+
4
+ MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
5
+
6
+
7
class SemanticRelevance:
    """Score how related a user text is to a document and to each paragraph."""

    def __init__(self):
        self.model = SentenceTransformer(MODEL_NAME)

    @staticmethod
    def _decision_for(score_doc: float) -> str:
        """Map a document-level similarity score to a relevance decision."""
        if score_doc >= 0.48:
            return "muy relacionado"
        if score_doc >= 0.35:
            return "relacionado"
        if score_doc >= 0.20:
            return "tangencial"
        return "no relacionado"

    def relate(self, user_text: str, title: str, paragraphs: List[Dict[str, str]]):
        """Compare ``user_text`` with the whole document and each paragraph.

        Args:
            user_text: The user's comment.
            title: Document title, prepended to the paragraph texts.
            paragraphs: Dicts with ``"type"`` and ``"text"`` keys.

        Returns:
            A dict with ``"decision_document"``, ``"score_document"``,
            ``"best_paragraph"`` and ``"per_paragraph"``. When ``paragraphs``
            is empty, ``"best_paragraph"`` is a placeholder with score 0.0.
        """
        texts = [p.get("text", "") for p in paragraphs]
        full_doc = title + ". " + " ".join(texts)

        emb_user = self.model.encode(
            user_text, normalize_embeddings=True, convert_to_tensor=True
        )
        emb_doc = self.model.encode(
            full_doc, normalize_embeddings=True, convert_to_tensor=True
        )
        score_doc = float(util.cos_sim(emb_user, emb_doc)[0][0])

        per_paragraph = []
        if texts:
            # Encode all paragraphs in one batch instead of one call each.
            emb_paragraphs = self.model.encode(
                texts, normalize_embeddings=True, convert_to_tensor=True
            )
            scores = util.cos_sim(emb_user, emb_paragraphs)[0].cpu().tolist()
            per_paragraph = [
                {
                    "index": i,
                    "type": p.get("type", "p"),
                    "text": text,
                    "score": round(float(score), 3),
                }
                for i, (p, text, score) in enumerate(zip(paragraphs, texts, scores))
            ]

        if per_paragraph:
            best_p = max(per_paragraph, key=lambda x: x["score"])
        else:
            # max() would raise on an empty list; callers read best_p["score"].
            best_p = {"index": None, "type": None, "text": "", "score": 0.0}

        return {
            "decision_document": self._decision_for(score_doc),
            "score_document": round(score_doc, 3),
            "best_paragraph": best_p,
            "per_paragraph": per_paragraph,
        }
src/sesgos/__init__.py ADDED
File without changes
src/sesgos/sesgos.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ from typing import List, Dict
3
+
4
+
5
class BiasDetector:
    """Detect rhetorical biases in a text with keyword heuristics."""

    # Default keyword indicators per bias type (matched case-insensitively
    # as plain substrings).
    BIAS_KEYWORDS = {
        "apelación a la autoridad": [
            "según", "de acuerdo con", "informó", "afirmó", "señaló",
            "SESNSP", "gobierno", "autoridades", "expertos dicen", "oficialmente",
        ],
        "generalización apresurada": [
            "todos", "nadie", "siempre", "nunca", "en todos los casos",
            "absolutamente", "todos sabemos", "todo el mundo", "cualquiera sabe",
            "varios de los",
        ],
        "razonamiento emocional": [
            "terrible", "desastroso", "maravilloso", "increíble",
            "!!", "alarmante", "preocupante extremadamente", "indignante",
            "pura propaganda", "mentiras", "mentira", "obviamente falso",
            "escándalo", "bombo y platillo", "vicios ocultos",
            "maquillaje", "irónicos comentan", "ironía",
        ],
        "falso dilema": [
            "solo hay dos opciones", "o esto o aquello",
            "no hay alternativa", "única solución", "o... o...",
        ],
        "ad hominem": [
            "manipulan", "manipula", "mienten", "miente",
            "corruptos", "corrupto", "deshonestos", "deshonesto",
            "inflar las cifras", "para la foto",
        ],
    }

    def __init__(self):
        # Imported here so this block does not depend on a module-level import.
        import torch

        # A fixed device=0 fails on hosts without a GPU; fall back to CPU (-1).
        device = 0 if torch.cuda.is_available() else -1

        # Text classifier; no method of this class calls it.
        self.classifier = pipeline(
            "text-classification",
            model="citizenlab/twitter-xlm-roberta-base-sentiment-finetunned",
            device=device,
            top_k=None,
        )

        # Per-instance copy so callers can extend the keyword lists safely.
        self.bias_keywords = {
            bias_type: list(keywords)
            for bias_type, keywords in self.BIAS_KEYWORDS.items()
        }

    def detect_heuristic_biases(self, text: str) -> List[Dict]:
        """Return the bias types for which at least two keywords occur in ``text``."""
        detected = []
        text_lower = text.lower()

        for bias_type, keywords in self.bias_keywords.items():
            matches = sum(1 for kw in keywords if kw.lower() in text_lower)

            # A single keyword is not enough to flag a bias.
            if matches >= 2:
                confidence = min(0.95, 0.60 + (matches * 0.10))
                detected.append(
                    {
                        "sesgo": bias_type,
                        "confianza": round(confidence, 3),
                        "metodo": "heurístico",
                    }
                )

        return detected

    def analyze_objectivity(self, text: str) -> Dict:
        """Count objectivity and subjectivity markers in ``text``.

        Returns:
            A dict with the two counts and ``"ratio"``: the objective count
            divided by the subjective count, using 1 when the latter is 0.
        """
        objectivity_markers = [
            "según",
            "de acuerdo con",
            "el informe señala",
            "las cifras muestran",
            "los datos indican",
            "sin embargo",
            "por otro lado",
            "analistas",
            "organizaciones",
        ]

        subjectivity_markers = [
            "obviamente",
            "claramente",
            "sin duda",
            "es evidente que",
            "todos saben",
            "la verdad es",
        ]

        text_lower = text.lower()
        obj_score = sum(1 for m in objectivity_markers if m in text_lower)
        subj_score = sum(1 for m in subjectivity_markers if m in text_lower)

        return {
            "objetivo": obj_score,
            "subjetivo": subj_score,
            "ratio": obj_score / max(subj_score, 1),
        }

    def detect(self, text: str):
        """Detect biases in ``text``.

        Returns:
            ``{"sesgos_detectados": [...]}``. When no bias is found, the list
            holds one placeholder entry: "texto objetivo" or
            "sin sesgos detectables".
        """
        heuristic_biases = self.detect_heuristic_biases(text)
        objectivity = self.analyze_objectivity(text)

        # A very objective text lowers confidence in each bias (floor 0.60).
        if objectivity["ratio"] > 3 and heuristic_biases:
            for bias in heuristic_biases:
                # Rounded to avoid float noise such as 0.6000000000000001.
                bias["confianza"] = round(max(0.60, bias["confianza"] - 0.20), 3)

        if heuristic_biases:
            return {"sesgos_detectados": heuristic_biases}

        if objectivity["ratio"] > 2:
            placeholder = {
                "sesgo": "texto objetivo",
                "confianza": 1.0,
                "metodo": "análisis de objetividad",
            }
        else:
            placeholder = {
                "sesgo": "sin sesgos detectables",
                "confianza": 1.0,
                "metodo": "análisis heurístico",
            }
        return {"sesgos_detectados": [placeholder]}