Dyraa18 committed on
Commit 0d1305c · verified · 1 parent: a211075
Files changed (1)
  1. app.py +37 -28
app.py CHANGED
@@ -80,11 +80,11 @@ SUBJECTS: Dict[str, Dict[str, str]] = {
    }
}

- # Fast thresholds & parameters
+ # ======= Fast thresholds & parameters (relaxed & adaptive) =======
TOP_K_FAISS = int(os.environ.get("TOP_K_FAISS", 15))
TOP_K_FINAL = int(os.environ.get("TOP_K_FINAL", 10))
- MIN_COSINE = float(os.environ.get("MIN_COSINE", 0.83))  # slightly looser so the fallback rarely fires
- MIN_LEXICAL = float(os.environ.get("MIN_LEXICAL", 0.8))
+ MIN_COSINE = float(os.environ.get("MIN_COSINE", 0.83))  # unchanged from 0.83
+ MIN_LEXICAL = float(os.environ.get("MIN_LEXICAL", 0.10))  # was 0.8 → too strict for short queries
FALLBACK_TEXT = os.environ.get("FALLBACK_TEXT", "maap pengetahuan tidak ada dalam database")
GUARDRAIL_BLOCK_TEXT = os.environ.get("GUARDRAIL_BLOCK_TEXT", "maap, pertanyaan ditolak oleh guardrail")
ENABLE_PROFILING = os.environ.get("ENABLE_PROFILING", "false").lower() == "true"
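
Note: MIN_LEXICAL is the substantive change here; MIN_COSINE keeps its 0.83 default. The old 0.8 floor was effectively unreachable, because lexical_overlap (defined below) is Jaccard similarity over content tokens, so a short query is capped at |query tokens| / |union|. A quick back-of-the-envelope bound, as a sketch:

    # Even a sentence containing every query token, plus n extra tokens,
    # scores only q / (q + n) under Jaccard overlap.
    q, n = 2, 8          # e.g. 2 content tokens in the query, 8 more in the sentence
    print(q / (q + n))   # 0.2, already far below the old MIN_LEXICAL of 0.8
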
@@ -125,11 +125,10 @@ TOKEN_RE = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+", re.UNICODE)

@lru_cache(maxsize=4096)
def _tok_cached(word: str) -> str:
-     # cache lowercase
    return word.lower()

def tok_id(text: str) -> List[str]:
-     return [tw for w in TOKEN_RE.findall(text or "") if (tw:=_tok_cached(w)) not in STOPWORDS_ID]
+     return [tw for w in TOKEN_RE.findall(text or "") if (tw := _tok_cached(w)) not in STOPWORDS_ID]

def lexical_overlap(query: str, sent: str) -> float:
    q = set(tok_id(query)); s = set(tok_id(sent))
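
Note: a standalone sketch of the tokenizer and overlap above, with STOPWORDS_ID stubbed (the real list used by app.py is larger, so exact scores will differ):

    import re

    TOKEN_RE = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+", re.UNICODE)
    STOPWORDS_ID = {"apa", "itu", "yang", "adalah", "atau"}  # stub list, assumed

    def tok_id(text):
        return [w.lower() for w in TOKEN_RE.findall(text or "") if w.lower() not in STOPWORDS_ID]

    def lexical_overlap(query, sent):
        q, s = set(tok_id(query)), set(tok_id(sent))
        return len(q & s) / max(1, len(q | s))

    score = lexical_overlap("apa itu gaya",
                            "Gaya adalah tarikan atau dorongan yang bekerja pada suatu benda.")
    print(round(score, 3))  # 0.143: one shared token ("gaya") in a 7-token union,
                            # which passes the new 0.10 floor but failed the old 0.8
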
@@ -138,7 +137,9 @@ def lexical_overlap(query: str, sent: str) -> float:
    return len(q & s) / max(1, len(q | s))

QUESTION_LIKE_RE = re.compile(r"(^\s*(apa|mengapa|bagaimana|sebutkan|jelaskan)\b|[?]$)", re.IGNORECASE)
- INSTRUCTION_RE = re.compile(r"\b(jelaskan|sebutkan|uraikan|kerjakan|diskusikan|tugas|latihan|menurut\s+pendapatmu)\b", re.IGNORECASE)
+ # Relaxed instruction filter: only patterns that are genuinely task instructions at the start of a sentence
+ INSTRUCTION_RE = re.compile(r"^\s*(kerjakan|tugas\s*:|diskusikan|latihan\s*:)\b", re.IGNORECASE)
+
META_PREFIX_PATTERNS = [
    r"berdasarkan\s+(?:kalimat|sumber|teks|konten|informasi)(?:\s+(?:di\s+atas|tersebut))?",
    r"menurut\s+(?:sumber|teks|konten)",
@@ -168,14 +169,14 @@ def strip_meta_sentence(s: str) -> str:
SENT_SPLIT_RE = re.compile(r"(?<=[.!?])\s+")

def split_sentences_fast(text: str) -> List[str]:
-     # no per-sentence encoding
    outs = []
    for p in SENT_SPLIT_RE.split(text or ""):
        s = clean_prefix((p or "").strip())
        if not s:
            continue
-         if s[-1] not in ".!?":
-             s += "."
+         # Optional: if your dataset often lacks terminal punctuation, enable this:
+         # if s and s[-1] not in ".!?":
+         #     s += "."
        if QUESTION_LIKE_RE.search(s):
            continue
        if INSTRUCTION_RE.search(s):
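
Note: SENT_SPLIT_RE splits on whitespace that follows sentence-final punctuation, after which the question and instruction filters prune the pieces. A small illustration:

    import re

    SENT_SPLIT_RE = re.compile(r"(?<=[.!?])\s+")
    text = "Gaya adalah tarikan atau dorongan. Satuannya adalah newton! Apa itu gaya?"
    print(SENT_SPLIT_RE.split(text))
    # ['Gaya adalah tarikan atau dorongan.', 'Satuannya adalah newton!', 'Apa itu gaya?']
    # The last piece is then dropped by QUESTION_LIKE_RE (leading "apa", trailing "?").
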
@@ -186,7 +187,6 @@ def split_sentences_fast(text: str) -> List[str]:
    return outs

# ========= MODEL WARMUP =========
-
def warmup_models():
    global ENCODER_TOKENIZER, ENCODER_MODEL, LLM
    if ENCODER_TOKENIZER is None or ENCODER_MODEL is None:
@@ -198,7 +198,6 @@
        LLM = load_model(MODEL_PATH, n_ctx=CTX_WINDOW, n_gpu_layers=N_GPU_LAYERS, n_threads=N_THREADS)

# ========= ASSETS =========
-
@lru_cache(maxsize=8)
def load_subject_assets(subject_key: str) -> "SubjectAssets":
    if subject_key not in SUBJECTS:
@@ -220,7 +219,6 @@
    return SubjectAssets(index=index, texts=texts, embs=embs)

# ========= ENCODER =========
-
@torch.inference_mode()
@lru_cache(maxsize=1024)
def encode_query_exact(text: str) -> np.ndarray:
@@ -235,7 +233,6 @@
    return float(np.dot(a, b) / denom)

# ========= FAST RETRIEVAL =========
-
def best_cosine_from_faiss(query: str, subject_key: str) -> float:
    assets = load_subject_assets(subject_key)
    q = encode_query_exact(query)
@@ -254,24 +251,35 @@ def retrieve_top_chunks(query: str, subject_key: str) -> List[str]:
    idxs = [i for i in idx[0] if 0 <= i < len(assets.texts)]
    return [assets.texts[i] for i in idxs[:TOP_K_FINAL]]

+ # ======= Two-phase sentence selection (strict → loose) =======
def pick_best_sentences_fast(query: str, chunks: List[str], top_k: int = 4) -> List[str]:
-     # No per-sentence encoding: only lexical overlap + reasonable length
+     """
+     Phase 1: take sentences with overlap >= MIN_LEXICAL.
+     Phase 2 (fallback): if fewer than top_k survive, take the highest-scoring sentences even below MIN_LEXICAL.
+     """
    cands: List[Tuple[float, str]] = []
    for ch in chunks:
        for s in split_sentences_fast(ch):
            ovl = lexical_overlap(query, s)
-             if ovl < MIN_LEXICAL:
-                 continue
-             # small bonus for sentences of reasonable length (50–220 chars)
            L = len(s)
            len_bonus = 0.05 if 50 <= L <= 220 else 0.0
            score = ovl + len_bonus
-             cands.append((score, s))
+             cands.append((score, clean_prefix(s)))
+
+     if not cands:
+         log.info("[RAG] No candidate sentences (split_sentences returned 0).")
+         return []
+
    cands.sort(key=lambda x: x[0], reverse=True)
+
+     strict = [s for sc, s in cands if sc + 1e-6 >= MIN_LEXICAL]
+     if len(strict) >= top_k:
+         return strict[:top_k]
+
+     log.info(f"[RAG] Fewer than {top_k} relevant sentences at MIN_LEXICAL={MIN_LEXICAL}; using the loose fallback.")
    return [s for _, s in cands[:top_k]]

# ========= PROMPT =========
-
def build_prompt(user_query: str, sentences: List[str]) -> str:
    block = "\n".join(f"- {clean_prefix(s)}" for s in sentences)
    system = (
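
Note: the two-phase selection above can be exercised in isolation. A minimal sketch over toy (score, sentence) pairs, assuming MIN_LEXICAL = 0.10:

    from typing import List, Tuple

    MIN_LEXICAL = 0.10

    def select(cands: List[Tuple[float, str]], top_k: int) -> List[str]:
        cands = sorted(cands, key=lambda x: x[0], reverse=True)
        # Phase 1: strict cut at the threshold (the epsilon absorbs float noise).
        strict = [s for sc, s in cands if sc + 1e-6 >= MIN_LEXICAL]
        if len(strict) >= top_k:
            return strict[:top_k]
        # Phase 2: too few strict hits, so fall back to the best-scoring ones anyway.
        return [s for _, s in cands[:top_k]]

    cands = [(0.25, "A"), (0.12, "B"), (0.08, "C"), (0.02, "D")]
    print(select(cands, top_k=2))  # ['A', 'B']       (strict phase suffices)
    print(select(cands, top_k=3))  # ['A', 'B', 'C']  (loose fallback kicks in)
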
@@ -518,14 +526,18 @@ def ask(subject_key: str):
    best = best_cosine_from_faiss(query, subject_key)
    log.info(f"[RAG] Subject={subject_key.upper()} | Best cosine={best:.3f}")
    if best < MIN_COSINE:
+         log.info(f"[RAG] Fallback by cosine: {best:.3f} < {MIN_COSINE}")
        return jsonify({"ok": True, "answer": FALLBACK_TEXT})

    chunks = retrieve_top_chunks(query, subject_key)
    if not chunks:
+         log.info("[RAG] Fallback by chunks=0")
        return jsonify({"ok": True, "answer": FALLBACK_TEXT})

    sentences = pick_best_sentences_fast(query, chunks, top_k=5)
+     log.info(f"[RAG] sentences_selected={len(sentences)} (min_lex={MIN_LEXICAL}, top_k={5})")
    if not sentences:
+         log.info("[RAG] Fallback by sentences=0")
        return jsonify({"ok": True, "answer": FALLBACK_TEXT})

    prompt = build_prompt(query, sentences)
@@ -543,9 +555,10 @@
        raw_answer = raw_answer.strip()
        log.info(f"[LLM] Raw answer repr (pass1): {repr(raw_answer)}")

-         text = re.sub(r"<think\\b[^>]*>.*?</think>", "", raw_answer, flags=re.DOTALL | re.IGNORECASE).strip()
-         text = re.sub(r"</?think\\b[^>]*>", "", text, flags=re.IGNORECASE).strip()
-         m_final = re.search(r"<final>\\s*(.+)$", text, flags=re.IGNORECASE | re.DOTALL)
+         # Strip <think> tags and take the <final> content
+         text = re.sub(r"<think\b[^>]*>.*?</think>", "", raw_answer, flags=re.DOTALL | re.IGNORECASE).strip()
+         text = re.sub(r"</?think\b[^>]*>", "", text, flags=re.IGNORECASE).strip()
+         m_final = re.search(r"<final>\s*(.+)$", text, flags=re.IGNORECASE | re.DOTALL)
        cleaned = (m_final.group(1).strip() if m_final else re.sub(r"<[^>]+>", "", text).strip())

        def _alpha_tokens(s: str) -> List[str]:
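
Note: this hunk is the substantive bug fix. Inside a raw string, \\b is an escaped backslash followed by a literal b, not a word boundary, so (assuming the old source matched this diff verbatim) the old patterns could never match a real <think> tag. A quick check:

    import re

    raw = "<think>internal reasoning</think><final>Gaya adalah tarikan atau dorongan."
    old = re.sub(r"<think\\b[^>]*>.*?</think>", "", raw, flags=re.DOTALL | re.IGNORECASE)
    new = re.sub(r"<think\b[^>]*>.*?</think>", "", raw, flags=re.DOTALL | re.IGNORECASE)
    print(old == raw)  # True: the over-escaped pattern never fires
    print(new)         # <final>Gaya adalah tarikan atau dorongan.
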
@@ -555,14 +568,11 @@
        s2 = (s or "").strip()
        if not s2:
            return True
-         # reject only placeholders/ellipses
        if s2 in {"...", ".", "..", "…"}:
            return True
        toks = _alpha_tokens(s2)
-         # four alphabetic tokens are enough to pass (more tolerant of short answers)
        if len(toks) >= 4:
            return False
-         # exception: short facts with common units/terms still pass
        if any(t.lower() in {"newton","n","kg","m","s"} for t in toks) and len(toks) >= 3:
            return False
        return True
@@ -593,13 +603,13 @@
        cleaned = cleaned2 or cleaned

        answer = cleaned
-
+
    except Exception as e:
        log.exception(f"[LLM] generate error: {e}")
        return jsonify({"ok": True, "answer": FALLBACK_TEXT})

    # Take only the first sentence
-     m = re.search(r"(.+?[.!?])(\\s|$)", answer)
+     m = re.search(r"(.+?[.!?])(\s|$)", answer)
    answer = (m.group(1) if m else answer).strip()
    answer = strip_meta_sentence(answer)
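
Note: the same over-escaping affected the first-sentence cut above. With \\s the only branch that could ever match was $, so the old pattern returned the whole answer instead of its first sentence:

    import re

    answer = "Gaya adalah tarikan atau dorongan. Satuan gaya adalah newton."
    m_old = re.search(r"(.+?[.!?])(\\s|$)", answer)
    m_new = re.search(r"(.+?[.!?])(\s|$)", answer)
    print(m_old.group(1))  # the whole two-sentence string
    print(m_new.group(1))  # Gaya adalah tarikan atau dorongan.
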
 
@@ -700,7 +710,6 @@ def admin_history():
    } for r in rows]
    return render_template("admin_history.html", items=items, subjects=SUBJECTS, q=q, username=username, subject=subject, role=role, page=page, per_page=per_page, total=total)

-
def _is_last_admin(s: Session) -> bool:
    return (s.query(func.count(User.id)).filter(User.is_admin.is_(True)).scalar() or 0) <= 1