tessss
app.py CHANGED
@@ -80,11 +80,11 @@ SUBJECTS: Dict[str, Dict[str, str]] = {
     }
 }
 
-# Thresholds & fast parameters
+# ======= Thresholds & fast parameters (now relaxed & adaptive) =======
 TOP_K_FAISS = int(os.environ.get("TOP_K_FAISS", 15))
 TOP_K_FINAL = int(os.environ.get("TOP_K_FINAL", 10))
-MIN_COSINE = float(os.environ.get("MIN_COSINE", 0.83))
-MIN_LEXICAL = float(os.environ.get("MIN_LEXICAL", 0.8))
+MIN_COSINE = float(os.environ.get("MIN_COSINE", 0.83))    # was 0.83
+MIN_LEXICAL = float(os.environ.get("MIN_LEXICAL", 0.10))  # was 0.8 → too strict for short queries
 FALLBACK_TEXT = os.environ.get("FALLBACK_TEXT", "maap pengetahuan tidak ada dalam database")
 GUARDRAIL_BLOCK_TEXT = os.environ.get("GUARDRAIL_BLOCK_TEXT", "maap, pertanyaan ditolak oleh guardrail")
 ENABLE_PROFILING = os.environ.get("ENABLE_PROFILING", "false").lower() == "true"
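Note on the MIN_LEXICAL change: lexical_overlap below is a Jaccard ratio, len(q & s) / len(q | s), which is bounded above by |q| / |q ∪ s|, so a short query can never reach the old 0.8 default against a normal-length sentence. A standalone sketch with toy token sets:

    # Toy illustration (not part of app.py): Jaccard ceiling for a 3-token query.
    q = {"hukum", "newton", "pertama"}           # 3-token query
    s = q | {f"tok{i}" for i in range(12)}       # 15-token sentence that contains the query
    jaccard = len(q & s) / max(1, len(q | s))
    print(jaccard)                               # 0.2, the best possible score here
    assert 0.10 <= jaccard < 0.8                 # passes the new default, never the old one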
@@ -125,11 +125,10 @@ TOKEN_RE = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+", re.UNICODE)
 
 @lru_cache(maxsize=4096)
 def _tok_cached(word: str) -> str:
-    # cache lowercase
     return word.lower()
 
 def tok_id(text: str) -> List[str]:
-    return [tw for w in TOKEN_RE.findall(text or "") if (tw:=_tok_cached(w)) not in STOPWORDS_ID]
+    return [tw for w in TOKEN_RE.findall(text or "") if (tw := _tok_cached(w)) not in STOPWORDS_ID]
 
 def lexical_overlap(query: str, sent: str) -> float:
     q = set(tok_id(query)); s = set(tok_id(sent))
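For readers unfamiliar with the walrus form in tok_id, an equivalent spelled-out loop (standalone sketch; STOPWORDS_ID here is a stand-in, the real set is defined elsewhere in app.py):

    import re
    from functools import lru_cache
    from typing import List

    TOKEN_RE = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+", re.UNICODE)
    STOPWORDS_ID = {"yang", "dan", "di"}   # stand-in for the app's stopword list

    @lru_cache(maxsize=4096)
    def _tok_cached(word: str) -> str:
        return word.lower()

    def tok_id(text: str) -> List[str]:
        out = []
        for w in TOKEN_RE.findall(text or ""):
            tw = _tok_cached(w)            # cached lowercase, same as the comprehension
            if tw not in STOPWORDS_ID:
                out.append(tw)
        return out

    print(tok_id("Hukum Newton yang pertama"))   # ['hukum', 'newton', 'pertama']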
@@ -138,7 +137,9 @@ def lexical_overlap(query: str, sent: str) -> float:
     return len(q & s) / max(1, len(q | s))
 
 QUESTION_LIKE_RE = re.compile(r"(^\s*(apa|mengapa|bagaimana|sebutkan|jelaskan)\b|[?]$)", re.IGNORECASE)
-
+# Relaxed instruction filter: only patterns that are genuinely task instructions at the start of a sentence
+INSTRUCTION_RE = re.compile(r"^\s*(kerjakan|tugas\s*:|diskusikan|latihan\s*:)\b", re.IGNORECASE)
+
 META_PREFIX_PATTERNS = [
     r"berdasarkan\s+(?:kalimat|sumber|teks|konten|informasi)(?:\s+(?:di\s+atas|tersebut))?",
     r"menurut\s+(?:sumber|teks|konten)",
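Because the new INSTRUCTION_RE is anchored with ^, it only drops sentences that open with a task verb; a mid-sentence mention survives. A standalone check:

    import re

    INSTRUCTION_RE = re.compile(r"^\s*(kerjakan|tugas\s*:|diskusikan|latihan\s*:)\b", re.IGNORECASE)

    print(bool(INSTRUCTION_RE.search("Kerjakan latihan di halaman 10.")))     # True: filtered out
    print(bool(INSTRUCTION_RE.search("Gaya adalah tarikan atau dorongan.")))  # False: kept
    print(bool(INSTRUCTION_RE.search("Siswa lalu kerjakan soal itu.")))       # False: kept, verb not at start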
@@ -168,14 +169,14 @@ def strip_meta_sentence(s: str) -> str:
 SENT_SPLIT_RE = re.compile(r"(?<=[.!?])\s+")
 
 def split_sentences_fast(text: str) -> List[str]:
-    # no per-sentence encoding
     outs = []
     for p in SENT_SPLIT_RE.split(text or ""):
         s = clean_prefix((p or "").strip())
         if not s:
             continue
-
-
+        # Option: if your dataset often lacks terminal punctuation, you can enable this:
+        # if s and s[-1] not in ".!?":
+        #     s += "."
         if QUESTION_LIKE_RE.search(s):
             continue
         if INSTRUCTION_RE.search(s):
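Context for the optional patch above: SENT_SPLIT_RE only splits after ., ! or ?, so a trailing fragment without terminal punctuation comes through as one piece, and the commented-out lines would simply normalize it. A standalone demo of the splitter:

    import re

    SENT_SPLIT_RE = re.compile(r"(?<=[.!?])\s+")

    text = "Gaya mengubah gerak benda. Apa itu gaya? Massa diukur dalam kg"
    print(SENT_SPLIT_RE.split(text))
    # ['Gaya mengubah gerak benda.', 'Apa itu gaya?', 'Massa diukur dalam kg']
    # The last fragment has no terminal punctuation; the optional patch would append '.'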
@@ -186,7 +187,6 @@ def split_sentences_fast(text: str) -> List[str]:
     return outs
 
 # ========= MODEL WARMUP =========
-
 def warmup_models():
     global ENCODER_TOKENIZER, ENCODER_MODEL, LLM
     if ENCODER_TOKENIZER is None or ENCODER_MODEL is None:
@@ -198,7 +198,6 @@ def warmup_models():
     LLM = load_model(MODEL_PATH, n_ctx=CTX_WINDOW, n_gpu_layers=N_GPU_LAYERS, n_threads=N_THREADS)
 
 # ========= ASSETS =========
-
 @lru_cache(maxsize=8)
 def load_subject_assets(subject_key: str) -> "SubjectAssets":
     if subject_key not in SUBJECTS:
@@ -220,7 +219,6 @@ def load_subject_assets(subject_key: str) -> "SubjectAssets":
     return SubjectAssets(index=index, texts=texts, embs=embs)
 
 # ========= ENCODER =========
-
 @torch.inference_mode()
 @lru_cache(maxsize=1024)
 def encode_query_exact(text: str) -> np.ndarray:
@@ -235,7 +233,6 @@ def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
     return float(np.dot(a, b) / denom)
 
 # ========= FAST RETRIEVAL =========
-
 def best_cosine_from_faiss(query: str, subject_key: str) -> float:
     assets = load_subject_assets(subject_key)
     q = encode_query_exact(query)
@@ -254,24 +251,35 @@ def retrieve_top_chunks(query: str, subject_key: str) -> List[str]:
     idxs = [i for i in idx[0] if 0 <= i < len(assets.texts)]
     return [assets.texts[i] for i in idxs[:TOP_K_FINAL]]
 
+# ======= Two-phase sentence selection (strict → loose) =======
 def pick_best_sentences_fast(query: str, chunks: List[str], top_k: int = 4) -> List[str]:
-
+    """
+    Phase 1: take sentences with overlap >= MIN_LEXICAL
+    Phase 2 (fallback): if fewer than top_k survive, take the top-scoring sentences even below MIN_LEXICAL
+    """
     cands: List[Tuple[float, str]] = []
     for ch in chunks:
         for s in split_sentences_fast(ch):
             ovl = lexical_overlap(query, s)
-            if ovl < MIN_LEXICAL:
-                continue
-            # small bonus when the sentence length is reasonable (50–220 chars)
             L = len(s)
             len_bonus = 0.05 if 50 <= L <= 220 else 0.0
             score = ovl + len_bonus
-            cands.append((score, s))
+            cands.append((score, clean_prefix(s)))
+
+    if not cands:
+        log.info("[RAG] Tidak ada kandidat kalimat (split_sentences menghasilkan 0).")
+        return []
+
     cands.sort(key=lambda x: x[0], reverse=True)
+
+    strict = [s for sc, s in cands if sc + 1e-6 >= MIN_LEXICAL]
+    if len(strict) >= top_k:
+        return strict[:top_k]
+
+    log.info(f"[RAG] Kalimat relevan < {top_k} pada MIN_LEXICAL={MIN_LEXICAL}; fallback longgar dipakai.")
     return [s for _, s in cands[:top_k]]
 
 # ========= PROMPT =========
-
 def build_prompt(user_query: str, sentences: List[str]) -> str:
     block = "\n".join(f"- {clean_prefix(s)}" for s in sentences)
     system = (
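To see the strict-then-loose behavior in isolation, a toy run with hand-picked scores (standalone sketch, no length bonus):

    # Toy candidates as (score, sentence); MIN_LEXICAL and top_k as in the diff defaults.
    MIN_LEXICAL, top_k = 0.10, 3
    cands = sorted([(0.50, "A"), (0.12, "B"), (0.08, "C"), (0.05, "D")],
                   key=lambda x: x[0], reverse=True)
    strict = [s for sc, s in cands if sc + 1e-6 >= MIN_LEXICAL]
    picked = strict[:top_k] if len(strict) >= top_k else [s for _, s in cands[:top_k]]
    print(picked)   # ['A', 'B', 'C']: only 2 pass strictly, so the loose path fills the third slot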
@@ -518,14 +526,18 @@ def ask(subject_key: str):
     best = best_cosine_from_faiss(query, subject_key)
     log.info(f"[RAG] Subject={subject_key.upper()} | Best cosine={best:.3f}")
     if best < MIN_COSINE:
+        log.info(f"[RAG] Fallback by cosine: {best:.3f} < {MIN_COSINE}")
         return jsonify({"ok": True, "answer": FALLBACK_TEXT})
 
     chunks = retrieve_top_chunks(query, subject_key)
     if not chunks:
+        log.info("[RAG] Fallback by chunks=0")
         return jsonify({"ok": True, "answer": FALLBACK_TEXT})
 
     sentences = pick_best_sentences_fast(query, chunks, top_k=5)
+    log.info(f"[RAG] sentences_selected={len(sentences)} (min_lex={MIN_LEXICAL}, top_k={5})")
     if not sentences:
+        log.info("[RAG] Fallback by sentences=0")
         return jsonify({"ok": True, "answer": FALLBACK_TEXT})
 
     prompt = build_prompt(query, sentences)
@@ -543,9 +555,10 @@ def ask(subject_key: str):
     raw_answer = raw_answer.strip()
     log.info(f"[LLM] Raw answer repr (pass1): {repr(raw_answer)}")
 
-
-    text = re.sub(r"
-
+    # Strip <think> tags and take the contents of <final>
+    text = re.sub(r"<think\b[^>]*>.*?</think>", "", raw_answer, flags=re.DOTALL | re.IGNORECASE).strip()
+    text = re.sub(r"</?think\b[^>]*>", "", text, flags=re.IGNORECASE).strip()
+    m_final = re.search(r"<final>\s*(.+)$", text, flags=re.IGNORECASE | re.DOTALL)
     cleaned = (m_final.group(1).strip() if m_final else re.sub(r"<[^>]+>", "", text).strip())
 
     def _alpha_tokens(s: str) -> List[str]:
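Behavior of the new tag cleanup on a typical reasoning-model output (standalone demo of the same three regexes):

    import re

    raw = "<think>internal reasoning...</think>\n<final> Gaya adalah tarikan atau dorongan. </final>"
    text = re.sub(r"<think\b[^>]*>.*?</think>", "", raw, flags=re.DOTALL | re.IGNORECASE).strip()
    text = re.sub(r"</?think\b[^>]*>", "", text, flags=re.IGNORECASE).strip()   # stray unmatched tags
    m_final = re.search(r"<final>\s*(.+)$", text, flags=re.IGNORECASE | re.DOTALL)
    cleaned = (m_final.group(1).strip() if m_final else re.sub(r"<[^>]+>", "", text).strip())
    print(cleaned)   # 'Gaya adalah tarikan atau dorongan. </final>'
    # Note the trailing </final> survives this step; the first-sentence cut later stops at the '.'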
@@ -555,14 +568,11 @@ def ask(subject_key: str):
     s2 = (s or "").strip()
     if not s2:
         return True
-    # reject placeholders/ellipsis only
     if s2 in {"...", ".", "..", "…"}:
         return True
     toks = _alpha_tokens(s2)
-    # 4 alphabetic tokens are enough to pass (more tolerant for short answers)
     if len(toks) >= 4:
         return False
-    # exception: short facts with common units/terms still pass
     if any(t.lower() in {"newton","n","kg","m","s"} for t in toks) and len(toks) >= 3:
         return False
     return True
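The heuristic whose comments were removed still reads: placeholders count as empty, four alphabetic tokens pass, and short unit-bearing facts pass. A standalone sketch (the enclosing def line is outside this hunk, so the name here is hypothetical, and _alpha_tokens is approximated with a regex):

    import re
    from typing import List

    def _alpha_tokens(s: str) -> List[str]:
        # approximation of the helper defined earlier in ask()
        return re.findall(r"[A-Za-zÀ-ÖØ-öø-ÿ]+", s)

    def looks_empty(s: str) -> bool:   # hypothetical name
        s2 = (s or "").strip()
        if not s2:
            return True
        if s2 in {"...", ".", "..", "…"}:      # placeholder/ellipsis only
            return True
        toks = _alpha_tokens(s2)
        if len(toks) >= 4:                     # enough alphabetic tokens to pass
            return False
        if any(t.lower() in {"newton", "n", "kg", "m", "s"} for t in toks) and len(toks) >= 3:
            return False                       # short facts with common units still pass
        return True

    print(looks_empty("..."))                          # True
    print(looks_empty("Gaya diukur dalam newton ya"))  # False (>= 4 tokens)
    print(looks_empty("Satuannya newton saja"))        # False (unit exception, 3 tokens)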
@@ -593,13 +603,13 @@ def ask(subject_key: str):
     cleaned = cleaned2 or cleaned
 
     answer = cleaned
-
+
     except Exception as e:
         log.exception(f"[LLM] generate error: {e}")
         return jsonify({"ok": True, "answer": FALLBACK_TEXT})
 
     # Keep only the first sentence
-    m = re.search(r"(.+?[.!?])(
+    m = re.search(r"(.+?[.!?])(\s|$)", answer)
     answer = (m.group(1) if m else answer).strip()
     answer = strip_meta_sentence(answer)
 
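The regex as written requires whitespace or end-of-string right after the sentence terminator, which keeps decimals like 9.8 intact. A standalone check:

    import re

    answer = "Percepatan gravitasi sekitar 9.8 m/s2 di bumi. Nilai ini bisa bervariasi."
    m = re.search(r"(.+?[.!?])(\s|$)", answer)
    print(m.group(1))   # 'Percepatan gravitasi sekitar 9.8 m/s2 di bumi.'
    # The (\s|$) guard keeps '9.8' whole: that '.' is followed by a digit, not whitespace.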
@@ -700,7 +710,6 @@ def admin_history():
     } for r in rows]
     return render_template("admin_history.html", items=items, subjects=SUBJECTS, q=q, username=username, subject=subject, role=role, page=page, per_page=per_page, total=total)
 
-
 def _is_last_admin(s: Session) -> bool:
     return (s.query(func.count(User.id)).filter(User.is_admin.is_(True)).scalar() or 0) <= 1
 