Spaces:

Hokeno
/

NERIndo

Sleeping

App Files Files Community

Hokeno committed on Jul 18

Commit

3ac3892

verified ·

1 Parent(s): 254a4f1

Upload 9 files

Browse files

Files changed (9) hide show

app.py +148 -0
main4.ipynb +1188 -0
ner_model/config.json +50 -0
ner_model/model.safetensors +3 -0
ner_model/special_tokens_map.json +7 -0
ner_model/tokenizer.json +0 -0
ner_model/tokenizer_config.json +58 -0
ner_model/vocab.txt +0 -0
requirements.txt +6 -0

app.py ADDED Viewed

	@@ -0,0 +1,148 @@

+import os
+import sys
+import subprocess
+import numpy as np
+from datasets import load_dataset
+from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification
+import torch
+import gradio as gr
+import pandas as pd
# Run inference on the GPU when PyTorch can see one; otherwise use the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Menggunakan perangkat: {device}")

# Load the NERGrit configuration of IndoNLU; its "ner_tags" feature supplies
# the label names used to build the id <-> label mappings below.
# NOTE(review): trust_remote_code=True lets the dataset repository's loading
# script run locally - confirm this source is trusted.
try:
    dataset = load_dataset("indonlp/indonlu", "nergrit", trust_remote_code=True)
except Exception as load_error:
    print(f"Gagal memuat dataset: {load_error}")
    sys.exit(1)

# Stop early when the dataset does not have the expected splits and columns.
if not ("train" in dataset and "test" in dataset):
    print("Dataset tidak memiliki split train/test yang diharapkan.")
    sys.exit(1)
if any(
    column not in dataset["train"].column_names
    for column in ("tokens", "ner_tags")
):
    print("Dataset tidak memiliki kolom 'tokens' atau 'ner_tags'.")
    sys.exit(1)

# Derive the label names and both lookup directions from the dataset schema.
try:
    label_list = dataset["train"].features["ner_tags"].feature.names
    id2label = dict(enumerate(label_list))
    label2id = {name: index for index, name in id2label.items()}
except Exception as label_error:
    print(f"Gagal mendapatkan label: {label_error}")
    sys.exit(1)

# Restore the tokenizer and the token-classification model saved in
# "./ner_model", then move the model onto the selected device.
try:
    tokenizer = AutoTokenizer.from_pretrained("./ner_model")
    model = AutoModelForTokenClassification.from_pretrained(
        "./ner_model",
        label2id=label2id,
        id2label=id2label,
        num_labels=len(label_list),
    )
    model.to(device)
except Exception as model_error:
    print(f"Gagal memuat model atau tokenizer dari './ner_model': {model_error}")
    print("Pastikan folder './ner_model' ada dan berisi model yang telah dilatih.")
    sys.exit(1)
+# Tokenize and align labels for test data
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align NER tags to sub-words.

    Args:
        examples: Batch mapping with a "tokens" entry (one list of words per
            sentence) and an "ner_tags" entry (one list of tag ids per
            sentence, indexed by word position).

    Returns:
        The tokenizer output for the batch with an added "labels" entry. Only
        the first sub-word of each word carries that word's tag; special
        tokens and continuation sub-words are set to -100, the conventional
        "ignore" label value.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        word_ids = encoded.word_ids(batch_index=row)
        # Pair every position with the word id of the position before it
        # (None for the first position) so word boundaries can be detected.
        predecessors = [None] + word_ids[:-1]
        aligned_labels.append([
            word_tags[current]
            if current is not None and current != previous
            else -100
            for current, previous in zip(word_ids, predecessors)
        ])

    encoded["labels"] = aligned_labels
    return encoded
# Apply the alignment helper to the loaded dataset in batches.
# NOTE(review): `dataset` is the object returned by load_dataset above, so this
# presumably processes every split rather than only the test split, and
# `tokenized_dataset` is not referenced anywhere else in the visible script -
# confirm whether this start-up work is still needed.
try:
    tokenized_dataset = dataset.map(
        function=tokenize_and_align_labels,
        batched=True,
    )
except Exception as map_error:
    print(f"Gagal menokenisasi dataset: {map_error}")
    sys.exit(1)
+# Function to predict entities for input text
def predict_entities(input_text):
    """Predict an NER label for every sub-word token of ``input_text``.

    Args:
        input_text: Raw text from the UI textbox. ``None``, empty and
            whitespace-only values are all treated as "no input".

    Returns:
        A ``pandas.DataFrame`` with the columns ``Token`` and ``Entity``, one
        row per tokenizer token excluding ``[CLS]`` and ``[SEP]``. For blank
        input the frame holds a single row whose ``Token`` cell carries the
        prompt asking the user to enter text.
    """
    columns = ["Token", "Entity"]

    if not input_text or not input_text.strip():
        # The result is rendered by a gr.Dataframe component, and Gradio reads
        # a plain string handed to that component as a CSV file path. Returning
        # the prompt as a bare string would therefore fail, so it is wrapped in
        # a one-row table instead.
        return pd.DataFrame(
            [{"Token": "Masukkan teks untuk diprediksi.", "Entity": ""}],
            columns=columns,
        )

    # Encode the raw text as a single-sequence batch on the model's device.
    encoded = tokenizer(input_text, return_tensors="pt", truncation=True)
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # Inference only: evaluation mode, no gradient tracking.
    model.eval()
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits
    # Highest-scoring label per position of the only sequence in the batch,
    # converted to plain Python ints so they match the id2label keys exactly.
    predicted_ids = logits.argmax(dim=2)[0].tolist()

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    rows = [
        {"Token": token, "Entity": id2label[label_id]}
        for token, label_id in zip(tokens, predicted_ids)
        if token not in ("[CLS]", "[SEP]")
    ]
    # Passing explicit columns keeps the table header stable even if no rows
    # survive the special-token filter.
    return pd.DataFrame(rows, columns=columns)
# Build the Gradio page: title and label legend, the input row with its two
# buttons, the result table, example sentences and the data/privacy notes.
# The creation order of the components is kept exactly as in the original.
with gr.Blocks() as demo:
    gr.Markdown("# Named Entity Recognition (NER) dengan IndoBERT")
    gr.Markdown("Masukkan teks dalam bahasa Indonesia untuk mendeteksi entitas seperti PERSON, ORGANISATION, PLACE, dll.")

    # Legend explaining the label scheme shown in the result table.
    gr.Markdown("## Keterangan Label Entitas")
    gr.Markdown("""
    - **O**: Token bukan entitas (contoh: "dan", "mengunjungi").
    - **B-PERSON**: Awal nama orang (contoh: "Joko" dalam "Joko Widodo").
    - **I-PERSON**: Lanjutan nama orang (contoh: "Widodo" atau "##do" dalam "Joko Widodo").
    - **B-PLACE**: Awal nama tempat (contoh: "Bali").
    - **I-PLACE**: Lanjutan nama tempat (contoh: "Indonesia" dalam "Bali, Indonesia").
    """)

    # Input row: free-text box plus the predict and clear buttons.
    with gr.Row():
        text_input = gr.Textbox(
            lines=3,
            label="Masukkan Teks",
            placeholder="Contoh: Joko Widodo menghadiri acara di Universitas Indonesia pada tanggal 14 Juni 2025",
        )
        submit_button = gr.Button(value="Prediksi")
        clear_button = gr.Button(value="Bersihkan")

    # Table that receives the value returned by predict_entities.
    output_table = gr.Dataframe(label="Hasil Prediksi")

    gr.Markdown("## Contoh Teks")
    gr.Markdown(
        "- SBY berkunjung ke Bali bersama Jokowi.\n"
        "- Universitas Gadjah Mada menyelenggarakan seminar pada 10 Maret 2025."
    )

    gr.Markdown("## Pertimbangan Keamanan Data, Privasi, dan Etika")
    gr.Markdown("""
    - **Keamanan Data**: Dataset bersumber dari berita publik, tidak mengandung informasi sensitif seperti alamat atau nomor identitas.
    - **Privasi**: Input pengguna tidak disimpan, menjaga privasi.
    - **Etika AI**: Dataset mencakup berbagai topik berita (politik, olahraga, budaya), mengurangi risiko bias terhadap entitas tertentu.
    """)

    # Wire the buttons: predict fills the table, clear resets only the textbox.
    submit_button.click(predict_entities, text_input, output_table)
    clear_button.click(lambda: "", None, text_input)

# Start the web app when the script is executed.
demo.launch()

main4.ipynb ADDED Viewed

	@@ -0,0 +1,1188 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a409dd5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:From d:\\Anaconda\\Lib\\site-packages\\tf_keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n",
+      "\n",
+      "Menggunakan perangkat: cuda\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[I 2025-07-18 06:26:20,055] A new study created in memory with name: no-name-50af0249-7af4-476f-988c-7342adeab58c\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Memulai hyperparameter tuning dengan Optuna...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+      "C:\\Users\\BUDI\\AppData\\Local\\Temp\\ipykernel_6152\\2584540621.py:147: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n",
+      "  trainer = Trainer(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='836' max='836' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [836/836 03:00, Epoch 4/4]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Epoch</th>\n",
+       "      <th>Training Loss</th>\n",
+       "      <th>Validation Loss</th>\n",
+       "      <th>Precision</th>\n",
+       "      <th>Recall</th>\n",
+       "      <th>F1</th>\n",
+       "      <th>Accuracy</th>\n",
+       "      <th>Per Entity</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>0.124700</td>\n",
+       "      <td>0.166868</td>\n",
+       "      <td>0.748068</td>\n",
+       "      <td>0.731118</td>\n",
+       "      <td>0.739496</td>\n",
+       "      <td>0.945582</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>0.103800</td>\n",
+       "      <td>0.157893</td>\n",
+       "      <td>0.750355</td>\n",
+       "      <td>0.799094</td>\n",
+       "      <td>0.773958</td>\n",
+       "      <td>0.952456</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>0.096100</td>\n",
+       "      <td>0.171932</td>\n",
+       "      <td>0.800613</td>\n",
+       "      <td>0.788520</td>\n",
+       "      <td>0.794521</td>\n",
+       "      <td>0.955606</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>0.032800</td>\n",
+       "      <td>0.178615</td>\n",
+       "      <td>0.750704</td>\n",
+       "      <td>0.805136</td>\n",
+       "      <td>0.776968</td>\n",
+       "      <td>0.954031</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='27' max='27' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [27/27 00:01]\n",
+       "    </div>\n",
+       "    "
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "[I 2025-07-18 06:29:29,091] Trial 0 finished with value: 0.7945205479452055 and parameters: {'learning_rate': 2.3555847899573657e-05, 'batch_size': 8, 'num_epochs': 4}. Best is trial 0 with value: 0.7945205479452055.\n",
+      "Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+      "C:\\Users\\BUDI\\AppData\\Local\\Temp\\ipykernel_6152\\2584540621.py:147: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n",
+      "  trainer = Trainer(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='1045' max='1045' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [1045/1045 04:05, Epoch 5/5]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Epoch</th>\n",
+       "      <th>Training Loss</th>\n",
+       "      <th>Validation Loss</th>\n",
+       "      <th>Precision</th>\n",
+       "      <th>Recall</th>\n",
+       "      <th>F1</th>\n",
+       "      <th>Accuracy</th>\n",
+       "      <th>Per Entity</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>0.123500</td>\n",
+       "      <td>0.163488</td>\n",
+       "      <td>0.728788</td>\n",
+       "      <td>0.726586</td>\n",
+       "      <td>0.727685</td>\n",
+       "      <td>0.945009</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>0.108800</td>\n",
+       "      <td>0.155614</td>\n",
+       "      <td>0.737346</td>\n",
+       "      <td>0.814199</td>\n",
+       "      <td>0.773869</td>\n",
+       "      <td>0.953745</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>0.110300</td>\n",
+       "      <td>0.170470</td>\n",
+       "      <td>0.763314</td>\n",
+       "      <td>0.779456</td>\n",
+       "      <td>0.771300</td>\n",
+       "      <td>0.953172</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>0.045800</td>\n",
+       "      <td>0.182373</td>\n",
+       "      <td>0.765557</td>\n",
+       "      <td>0.799094</td>\n",
+       "      <td>0.781966</td>\n",
+       "      <td>0.954031</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5</td>\n",
+       "      <td>0.022400</td>\n",
+       "      <td>0.191159</td>\n",
+       "      <td>0.758571</td>\n",
+       "      <td>0.802115</td>\n",
+       "      <td>0.779736</td>\n",
+       "      <td>0.953315</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='27' max='27' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [27/27 00:01]\n",
+       "    </div>\n",
+       "    "
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "[I 2025-07-18 06:33:40,086] Trial 1 finished with value: 0.7819660014781965 and parameters: {'learning_rate': 1.7904807706862636e-05, 'batch_size': 8, 'num_epochs': 5}. Best is trial 0 with value: 0.7945205479452055.\n",
+      "Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+      "C:\\Users\\BUDI\\AppData\\Local\\Temp\\ipykernel_6152\\2584540621.py:147: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n",
+      "  trainer = Trainer(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='420' max='420' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [420/420 05:47, Epoch 4/4]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Epoch</th>\n",
+       "      <th>Training Loss</th>\n",
+       "      <th>Validation Loss</th>\n",
+       "      <th>Precision</th>\n",
+       "      <th>Recall</th>\n",
+       "      <th>F1</th>\n",
+       "      <th>Accuracy</th>\n",
+       "      <th>Per Entity</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>0.138600</td>\n",
+       "      <td>0.185550</td>\n",
+       "      <td>0.738769</td>\n",
+       "      <td>0.670695</td>\n",
+       "      <td>0.703088</td>\n",
+       "      <td>0.942432</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>0.109800</td>\n",
+       "      <td>0.154619</td>\n",
+       "      <td>0.781899</td>\n",
+       "      <td>0.796073</td>\n",
+       "      <td>0.788922</td>\n",
+       "      <td>0.955463</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>0.069800</td>\n",
+       "      <td>0.155078</td>\n",
+       "      <td>0.807750</td>\n",
+       "      <td>0.818731</td>\n",
+       "      <td>0.813203</td>\n",
+       "      <td>0.960332</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>0.027200</td>\n",
+       "      <td>0.174292</td>\n",
+       "      <td>0.765292</td>\n",
+       "      <td>0.812689</td>\n",
+       "      <td>0.788278</td>\n",
+       "      <td>0.954747</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='14' max='14' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [14/14 00:00]\n",
+       "    </div>\n",
+       "    "
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "[I 2025-07-18 06:39:32,835] Trial 2 finished with value: 0.8132033008252062 and parameters: {'learning_rate': 3.672145523121866e-05, 'batch_size': 16, 'num_epochs': 4}. Best is trial 2 with value: 0.8132033008252062.\n",
+      "Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+      "C:\\Users\\BUDI\\AppData\\Local\\Temp\\ipykernel_6152\\2584540621.py:147: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n",
+      "  trainer = Trainer(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='525' max='525' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [525/525 07:42, Epoch 5/5]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Epoch</th>\n",
+       "      <th>Training Loss</th>\n",
+       "      <th>Validation Loss</th>\n",
+       "      <th>Precision</th>\n",
+       "      <th>Recall</th>\n",
+       "      <th>F1</th>\n",
+       "      <th>Accuracy</th>\n",
+       "      <th>Per Entity</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>0.143200</td>\n",
+       "      <td>0.170970</td>\n",
+       "      <td>0.745514</td>\n",
+       "      <td>0.690332</td>\n",
+       "      <td>0.716863</td>\n",
+       "      <td>0.945869</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>0.107300</td>\n",
+       "      <td>0.154406</td>\n",
+       "      <td>0.766141</td>\n",
+       "      <td>0.806647</td>\n",
+       "      <td>0.785872</td>\n",
+       "      <td>0.953029</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>0.075100</td>\n",
+       "      <td>0.158503</td>\n",
+       "      <td>0.795420</td>\n",
+       "      <td>0.787009</td>\n",
+       "      <td>0.791192</td>\n",
+       "      <td>0.956895</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>0.025800</td>\n",
+       "      <td>0.179348</td>\n",
+       "      <td>0.764791</td>\n",
+       "      <td>0.800604</td>\n",
+       "      <td>0.782288</td>\n",
+       "      <td>0.954461</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5</td>\n",
+       "      <td>0.013400</td>\n",
+       "      <td>0.185257</td>\n",
+       "      <td>0.766049</td>\n",
+       "      <td>0.811178</td>\n",
+       "      <td>0.787968</td>\n",
+       "      <td>0.953888</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='14' max='14' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [14/14 00:01]\n",
+       "    </div>\n",
+       "    "
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "[I 2025-07-18 06:47:22,280] Trial 3 finished with value: 0.7911921032649962 and parameters: {'learning_rate': 3.713773945286763e-05, 'batch_size': 16, 'num_epochs': 5}. Best is trial 2 with value: 0.8132033008252062.\n",
+      "Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+      "C:\\Users\\BUDI\\AppData\\Local\\Temp\\ipykernel_6152\\2584540621.py:147: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n",
+      "  trainer = Trainer(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='1045' max='1045' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [1045/1045 04:30, Epoch 5/5]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Epoch</th>\n",
+       "      <th>Training Loss</th>\n",
+       "      <th>Validation Loss</th>\n",
+       "      <th>Precision</th>\n",
+       "      <th>Recall</th>\n",
+       "      <th>F1</th>\n",
+       "      <th>Accuracy</th>\n",
+       "      <th>Per Entity</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>0.132700</td>\n",
+       "      <td>0.169205</td>\n",
+       "      <td>0.715361</td>\n",
+       "      <td>0.717523</td>\n",
+       "      <td>0.716440</td>\n",
+       "      <td>0.944007</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>0.120000</td>\n",
+       "      <td>0.155390</td>\n",
+       "      <td>0.750700</td>\n",
+       "      <td>0.809668</td>\n",
+       "      <td>0.779070</td>\n",
+       "      <td>0.953458</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>0.136600</td>\n",
+       "      <td>0.163555</td>\n",
+       "      <td>0.761974</td>\n",
+       "      <td>0.793051</td>\n",
+       "      <td>0.777202</td>\n",
+       "      <td>0.954174</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>0.067900</td>\n",
+       "      <td>0.172124</td>\n",
+       "      <td>0.766476</td>\n",
+       "      <td>0.808157</td>\n",
+       "      <td>0.786765</td>\n",
+       "      <td>0.953888</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5</td>\n",
+       "      <td>0.035200</td>\n",
+       "      <td>0.180249</td>\n",
+       "      <td>0.759943</td>\n",
+       "      <td>0.808157</td>\n",
+       "      <td>0.783309</td>\n",
+       "      <td>0.953745</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='27' max='27' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [27/27 00:01]\n",
+       "    </div>\n",
+       "    "
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "[I 2025-07-18 06:51:59,633] Trial 4 finished with value: 0.7867647058823529 and parameters: {'learning_rate': 1.1923156920458335e-05, 'batch_size': 8, 'num_epochs': 5}. Best is trial 2 with value: 0.8132033008252062.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Hyperparameter terbaik:\n",
+      "{'learning_rate': 3.672145523121866e-05, 'batch_size': 16, 'num_epochs': 4}\n",
+      "F1-Score terbaik: 0.8132\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+      "C:\\Users\\BUDI\\AppData\\Local\\Temp\\ipykernel_6152\\2584540621.py:195: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n",
+      "  trainer = Trainer(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Memulai pelatihan dengan hyperparameter terbaik...\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='420' max='420' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [420/420 07:01, Epoch 4/4]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Epoch</th>\n",
+       "      <th>Training Loss</th>\n",
+       "      <th>Validation Loss</th>\n",
+       "      <th>Precision</th>\n",
+       "      <th>Recall</th>\n",
+       "      <th>F1</th>\n",
+       "      <th>Accuracy</th>\n",
+       "      <th>Per Entity</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>0.138600</td>\n",
+       "      <td>0.185550</td>\n",
+       "      <td>0.738769</td>\n",
+       "      <td>0.670695</td>\n",
+       "      <td>0.703088</td>\n",
+       "      <td>0.942432</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>0.109800</td>\n",
+       "      <td>0.154619</td>\n",
+       "      <td>0.781899</td>\n",
+       "      <td>0.796073</td>\n",
+       "      <td>0.788922</td>\n",
+       "      <td>0.955463</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>0.069800</td>\n",
+       "      <td>0.155078</td>\n",
+       "      <td>0.807750</td>\n",
+       "      <td>0.818731</td>\n",
+       "      <td>0.813203</td>\n",
+       "      <td>0.960332</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>0.027200</td>\n",
+       "      <td>0.174292</td>\n",
+       "      <td>0.765292</td>\n",
+       "      <td>0.812689</td>\n",
+       "      <td>0.788278</td>\n",
+       "      <td>0.954747</td>\n",
+       "      <td>{}</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Mengevaluasi model pada data test...\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='14' max='14' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [14/14 00:05]\n",
+       "    </div>\n",
+       "    "
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval_per_entity\" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.\n",
+      "Trainer is attempting to log a value of \"{}\" of type <class 'dict'> for key \"eval/per_entity\" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Hasil Evaluasi:\n",
+      "Precision: 0.7528\n",
+      "Recall: 0.7878\n",
+      "F1-Score: 0.7699\n",
+      "Accuracy: 0.9497\n",
+      "\n",
+      "Metrik per Entitas:\n",
+      "\n",
+      "Model dan tokenizer telah disimpan ke './ner_model'\n",
+      "\n",
+      "Contoh Prediksi pada Data Test (5 Sampel):\n",
+      "\n",
+      "Sampel 1:\n",
+      "Tokens: [CLS] joe ##tat ##a hadi ##hard ##aja dan dihadiri oleh rektor undip prof . [SEP]\n",
+      "True Labels: ['B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'B-ORGANISATION', 'O', 'O']\n",
+      "Predicted Labels: ['B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'B-PLACE', 'O', 'O']\n",
+      "\n",
+      "Sampel 2:\n",
+      "Tokens: [CLS] sejak masih duduk di bangku sekolah tk kevin sudah belajar alat musik piano secara formal dan ketika ia menginjak sekolah smp pemilik nama asli kevin april ##io sum ##aat ##maj ##a ini , mulai belajar menulis lagu sendiri . [SEP]\n",
+      "True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']\n",
+      "Predicted Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']\n",
+      "\n",
+      "Sampel 3:\n",
+      "Tokens: [CLS] pada tanggal 6 februari 1976 , wakil ketua lock ##he ##ed corporation memberitahu subk ##omi ##te senat as bahwa tana ##ka selaku pm telah dibayar ( dis ##ogo ##k ) sebagai ganjaran pembelian pesawat lock ##he ##ed l - 1011 . [SEP]\n",
+      "True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORGANISATION', 'I-ORGANISATION', 'O', 'O', 'O', 'B-PLACE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORGANISATION', 'O', 'O', 'O', 'O']\n",
+      "Predicted Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORGANISATION', 'I-ORGANISATION', 'O', 'O', 'O', 'B-PLACE', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORGANISATION', 'O', 'O', 'O', 'O']\n",
+      "\n",
+      "Sampel 4:\n",
+      "Tokens: [CLS] dengan kondisi alam yang sejuk dan curah hujan yang tinggi maka didaerah tersebut banyak didapati bermacam jenis flora dan fauna seperti : gajah yang di kenal dengan legenda poc ##ut me ##urah ##nya , rusa , harimau , beruang , kancil , babi hutan , tengg ##iling , landak dan ular , juga terdapat berbagai macam jenis burung yang selalu menghiasi kawasan ini . [SEP]\n",
+      "True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']\n",
+      "Predicted Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']\n",
+      "\n",
+      "Sampel 5:\n",
+      "Tokens: [CLS] awak pesawat yang terdiri atas pilot ard ##y ted ##jo , kopi ##lot h ribuan dan dua awak lainnya perry reh ##ata dan mei ##nas ##ta segera membuka pintu pesawat dan menurunkan penumpang dengan selamat . tanggal 14 juni 2009 , hari minggu , pukul 09 . 20 , pesawat terbang express air jenis dor ##nie ##r d ##32 ##8 - 100 bernomor badan pk - tx ##n , mengalami kecelakaan saat mendarat . [SEP]\n",
+      "True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORGANISATION', 'I-ORGANISATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']\n",
+      "Predicted Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']\n",
+      "\n",
+      "Analisis Pola Error (Tanggal diprediksi sebagai Lokasi):\n",
+      "Tidak ditemukan contoh tanggal yang diprediksi sebagai lokasi dalam 100 sampel.\n",
+      "\n",
+      "Pertimbangan Keamanan Data, Privasi, dan Etika:\n",
+      "- Dataset bersumber dari berita publik, tidak mengandung informasi sensitif seperti alamat atau nomor identitas.\n",
+      "- Nama orang dalam dataset berasal dari media publik, aman untuk digunakan.\n",
+      "- Dataset mencakup berbagai topik berita, mengurangi risiko bias terhadap entitas tertentu.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "import subprocess\n",
+    "import numpy as np\n",
+    "from datasets import load_dataset\n",
+    "from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, Trainer, TrainingArguments\n",
+    "import evaluate\n",
+    "import torch\n",
+    "import optuna\n",
+    "\n",
+    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+    "print(f\"Menggunakan perangkat: {device}\")\n",
+    "\n",
+    "# Load dataset\n",
+    "try:\n",
+    "    dataset = load_dataset(\"indonlp/indonlu\", \"nergrit\", trust_remote_code=True)\n",
+    "except Exception as e:\n",
+    "    print(f\"Gagal memuat dataset: {e}\")\n",
+    "    sys.exit(1)\n",
+    "\n",
+    "# Verify dataset structure\n",
+    "if \"train\" not in dataset or \"validation\" not in dataset or \"test\" not in dataset:\n",
+    "    print(\"Dataset tidak memiliki split train/validation/test yang diharapkan.\")\n",
+    "    sys.exit(1)\n",
+    "if \"tokens\" not in dataset[\"train\"].column_names or \"ner_tags\" not in dataset[\"train\"].column_names:\n",
+    "    print(\"Dataset tidak memiliki kolom 'tokens' atau 'ner_tags'.\")\n",
+    "    sys.exit(1)\n",
+    "\n",
+    "# Define label list\n",
+    "try:\n",
+    "    label_list = dataset[\"train\"].features[\"ner_tags\"].feature.names\n",
+    "    label2id = {label: i for i, label in enumerate(label_list)}\n",
+    "    id2label = {i: label for i, label in enumerate(label_list)}\n",
+    "except Exception as e:\n",
+    "    print(f\"Gagal mendapatkan label: {e}\")\n",
+    "    sys.exit(1)\n",
+    "\n",
+    "# Load tokenizer\n",
+    "try:\n",
+    "    tokenizer = AutoTokenizer.from_pretrained(\"indobenchmark/indobert-base-p1\")\n",
+    "except Exception as e:\n",
+    "    print(f\"Gagal memuat tokenizer: {e}\")\n",
+    "    sys.exit(1)\n",
+    "\n",
+    "# Tokenize and align labels\n",
+    "def tokenize_and_align_labels(examples):\n",
+    "    \"\"\"Tokenize pre-split words and align word-level NER tags to sub-word tokens.\n",
+    "\n",
+    "    The first sub-token of each word receives that word's tag; special tokens\n",
+    "    (word id None) and continuation sub-tokens receive -100, the value that\n",
+    "    compute_metrics filters out before scoring.\n",
+    "\n",
+    "    Args:\n",
+    "        examples: batch with \"tokens\" (lists of words) and \"ner_tags\" (lists of tag ids).\n",
+    "\n",
+    "    Returns:\n",
+    "        The tokenizer output with an added \"labels\" entry, one list per example.\n",
+    "    \"\"\"\n",
+    "    encoded = tokenizer(examples[\"tokens\"], truncation=True, is_split_into_words=True)\n",
+    "    aligned = []\n",
+    "    for batch_idx, word_tags in enumerate(examples[\"ner_tags\"]):\n",
+    "        token_word_ids = encoded.word_ids(batch_index=batch_idx)\n",
+    "        # Pair every token's word id with the word id of the token before it\n",
+    "        # (None stands in front of the very first token).\n",
+    "        preceding_ids = [None] + token_word_ids[:-1]\n",
+    "        aligned.append([\n",
+    "            word_tags[current] if current is not None and current != previous else -100\n",
+    "            for current, previous in zip(token_word_ids, preceding_ids)\n",
+    "        ])\n",
+    "    encoded[\"labels\"] = aligned\n",
+    "    return encoded\n",
+    "\n",
+    "# Tokenize dataset\n",
+    "try:\n",
+    "    tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)\n",
+    "except Exception as e:\n",
+    "    print(f\"Gagal menokenisasi dataset: {e}\")\n",
+    "    sys.exit(1)\n",
+    "\n",
+    "# Data collator\n",
+    "data_collator = DataCollatorForTokenClassification(tokenizer)\n",
+    "\n",
+    "# Load evaluation metric\n",
+    "metric = evaluate.load(\"seqeval\")\n",
+    "\n",
+    "# Compute metrics\n",
+    "def compute_metrics(p):\n",
+    "    \"\"\"Score token-classification predictions with seqeval.\n",
+    "\n",
+    "    Args:\n",
+    "        p: (logits, label_ids) pair as passed by Trainer; positions whose\n",
+    "            label id is -100 are excluded from scoring.\n",
+    "\n",
+    "    Returns:\n",
+    "        dict with overall \"precision\", \"recall\", \"f1\", \"accuracy\" and a\n",
+    "        \"per_entity\" mapping of entity type -> {precision, recall, f1}.\n",
+    "    \"\"\"\n",
+    "    predictions, labels = p\n",
+    "    predictions = np.argmax(predictions, axis=2)\n",
+    "    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]\n",
+    "    pred_labels = [\n",
+    "        [id2label[pred] for pred, l in zip(prediction, label) if l != -100]\n",
+    "        for prediction, label in zip(predictions, labels)\n",
+    "    ]\n",
+    "    results = metric.compute(predictions=pred_labels, references=true_labels)\n",
+    "    # seqeval reports per-type scores under the entity name exactly as it is\n",
+    "    # written in the tags (e.g. \"PERSON\"), while the overall_* entries are plain\n",
+    "    # numbers. Collect every dict-valued entry instead of looking up lower-cased,\n",
+    "    # hard-coded names, which never matched and left per_entity empty.\n",
+    "    per_entity = {\n",
+    "        entity: {\n",
+    "            \"precision\": float(scores[\"precision\"]),\n",
+    "            \"recall\": float(scores[\"recall\"]),\n",
+    "            \"f1\": float(scores[\"f1\"]),\n",
+    "        }\n",
+    "        for entity, scores in results.items()\n",
+    "        if isinstance(scores, dict)\n",
+    "    }\n",
+    "    return {\n",
+    "        \"precision\": results[\"overall_precision\"],\n",
+    "        \"recall\": results[\"overall_recall\"],\n",
+    "        \"f1\": results[\"overall_f1\"],\n",
+    "        \"accuracy\": results[\"overall_accuracy\"],\n",
+    "        \"per_entity\": per_entity,\n",
+    "    }\n",
+    "\n",
+    "# Define objective function for Optuna\n",
+    "def objective(trial):\n",
+    "    \"\"\"Optuna objective: fine-tune with sampled hyperparameters, return validation F1.\n",
+    "\n",
+    "    Args:\n",
+    "        trial: Optuna trial used to sample learning rate, batch size and epoch count.\n",
+    "\n",
+    "    Returns:\n",
+    "        The \"eval_f1\" value reported by Trainer.evaluate() on the validation split.\n",
+    "    \"\"\"\n",
+    "    # Sampling order: learning rate, then batch size, then number of epochs.\n",
+    "    params = {\n",
+    "        \"learning_rate\": trial.suggest_float(\"learning_rate\", 1e-5, 5e-5, log=True),\n",
+    "        \"batch_size\": trial.suggest_categorical(\"batch_size\", [8, 16, 32]),\n",
+    "        \"num_epochs\": trial.suggest_int(\"num_epochs\", 3, 5),\n",
+    "    }\n",
+    "    trial_id = trial.number\n",
+    "\n",
+    "    # Every trial starts again from the pretrained checkpoint.\n",
+    "    trial_model = AutoModelForTokenClassification.from_pretrained(\n",
+    "        \"indobenchmark/indobert-base-p1\",\n",
+    "        num_labels=len(label_list),\n",
+    "        id2label=id2label,\n",
+    "        label2id=label2id,\n",
+    "    )\n",
+    "    trial_model.to(device)\n",
+    "\n",
+    "    # Outputs and logs go to per-trial directories.\n",
+    "    trial_trainer = Trainer(\n",
+    "        model=trial_model,\n",
+    "        args=TrainingArguments(\n",
+    "            output_dir=f\"./results_trial_{trial_id}\",\n",
+    "            eval_strategy=\"epoch\",\n",
+    "            learning_rate=params[\"learning_rate\"],\n",
+    "            per_device_train_batch_size=params[\"batch_size\"],\n",
+    "            per_device_eval_batch_size=params[\"batch_size\"],\n",
+    "            num_train_epochs=params[\"num_epochs\"],\n",
+    "            weight_decay=0.01,\n",
+    "            logging_dir=f\"./logs_trial_{trial_id}\",\n",
+    "            logging_steps=10,\n",
+    "            save_strategy=\"epoch\",\n",
+    "            load_best_model_at_end=True,\n",
+    "            metric_for_best_model=\"f1\",\n",
+    "        ),\n",
+    "        train_dataset=tokenized_dataset[\"train\"],\n",
+    "        eval_dataset=tokenized_dataset[\"validation\"],\n",
+    "        tokenizer=tokenizer,\n",
+    "        data_collator=data_collator,\n",
+    "        compute_metrics=compute_metrics,\n",
+    "    )\n",
+    "\n",
+    "    trial_trainer.train()\n",
+    "    return trial_trainer.evaluate()[\"eval_f1\"]\n",
+    "\n",
+    "# Run Optuna optimization\n",
+    "print(\"Memulai hyperparameter tuning dengan Optuna...\")\n",
+    "study = optuna.create_study(direction=\"maximize\")\n",
+    "study.optimize(objective, n_trials=5)  # Adjust n_trials as needed\n",
+    "print(\"\\nHyperparameter terbaik:\")\n",
+    "print(study.best_params)\n",
+    "print(f\"F1-Score terbaik: {study.best_value:.4f}\")\n",
+    "\n",
+    "# Train final model with best hyperparameters\n",
+    "best_params = study.best_params\n",
+    "model = AutoModelForTokenClassification.from_pretrained(\n",
+    "    \"indobenchmark/indobert-base-p1\",\n",
+    "    num_labels=len(label_list),\n",
+    "    id2label=id2label,\n",
+    "    label2id=label2id\n",
+    ")\n",
+    "model.to(device)\n",
+    "\n",
+    "training_args = TrainingArguments(\n",
+    "    output_dir=\"./results\",\n",
+    "    eval_strategy=\"epoch\",\n",
+    "    learning_rate=best_params[\"learning_rate\"],\n",
+    "    per_device_train_batch_size=best_params[\"batch_size\"],\n",
+    "    per_device_eval_batch_size=best_params[\"batch_size\"],\n",
+    "    num_train_epochs=best_params[\"num_epochs\"],\n",
+    "    weight_decay=0.01,\n",
+    "    logging_dir=\"./logs\",\n",
+    "    logging_steps=10,\n",
+    "    save_strategy=\"epoch\",\n",
+    "    load_best_model_at_end=True,\n",
+    "    metric_for_best_model=\"f1\",\n",
+    ")\n",
+    "\n",
+    "trainer = Trainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=tokenized_dataset[\"train\"],\n",
+    "    eval_dataset=tokenized_dataset[\"validation\"],\n",
+    "    tokenizer=tokenizer,\n",
+    "    data_collator=data_collator,\n",
+    "    compute_metrics=compute_metrics,\n",
+    ")\n",
+    "\n",
+    "# Train the model\n",
+    "print(\"\\nMemulai pelatihan dengan hyperparameter terbaik...\")\n",
+    "try:\n",
+    "    trainer.train()\n",
+    "except Exception as e:\n",
+    "    print(f\"Gagal melatih model: {e}\")\n",
+    "    sys.exit(1)\n",
+    "\n",
+    "# Evaluate on test set\n",
+    "print(\"\\nMengevaluasi model pada data test...\")\n",
+    "try:\n",
+    "    results = trainer.evaluate(tokenized_dataset[\"test\"])\n",
+    "except Exception as e:\n",
+    "    print(f\"Gagal mengevaluasi model: {e}\")\n",
+    "    sys.exit(1)\n",
+    "\n",
+    "# Print evaluation results\n",
+    "print(\"\\nHasil Evaluasi:\")\n",
+    "print(f\"Precision: {results['eval_precision']:.4f}\")\n",
+    "print(f\"Recall: {results['eval_recall']:.4f}\")\n",
+    "print(f\"F1-Score: {results['eval_f1']:.4f}\")\n",
+    "print(f\"Accuracy: {results['eval_accuracy']:.4f}\")\n",
+    "print(\"\\nMetrik per Entitas:\")\n",
+    "for entity, metrics in results.get(\"eval_per_entity\", {}).items():\n",
+    "    print(f\"{entity}:\")\n",
+    "    print(f\"  Precision: {metrics['precision']:.4f}\")\n",
+    "    print(f\"  Recall: {metrics['recall']:.4f}\")\n",
+    "    print(f\"  F1-Score: {metrics['f1']:.4f}\")\n",
+    "\n",
+    "# Save the model\n",
+    "try:\n",
+    "    model.save_pretrained(\"./ner_model\")\n",
+    "    tokenizer.save_pretrained(\"./ner_model\")\n",
+    "    print(\"\\nModel dan tokenizer telah disimpan ke './ner_model'\")\n",
+    "except Exception as e:\n",
+    "    print(f\"Gagal menyimpan model: {e}\")\n",
+    "    sys.exit(1)\n",
+    "\n",
+    "# Example inference on test samples\n",
+    "print(\"\\nContoh Prediksi pada Data Test (5 Sampel):\")\n",
+    "try:\n",
+    "    for i in range(min(5, len(tokenized_dataset[\"test\"]))):\n",
+    "        sample = tokenized_dataset[\"test\"][i]\n",
+    "        input_ids = torch.tensor([sample[\"input_ids\"]], device=device)\n",
+    "        attention_mask = torch.tensor([sample[\"attention_mask\"]], device=device)\n",
+    "        model.eval()\n",
+    "        with torch.no_grad():\n",
+    "            outputs = model(input_ids, attention_mask=attention_mask)\n",
+    "        predictions = outputs.logits.argmax(dim=2)[0].cpu().numpy()\n",
+    "        tokens = tokenizer.convert_ids_to_tokens(sample[\"input_ids\"])\n",
+    "        labels = [id2label[pred] for pred, label in zip(predictions, sample[\"labels\"]) if label != -100]\n",
+    "        true_labels = [id2label[label] for label in sample[\"labels\"] if label != -100]\n",
+    "        print(f\"\\nSampel {i+1}:\")\n",
+    "        print(f\"Tokens: {' '.join(tokens)}\")\n",
+    "        print(f\"True Labels: {true_labels}\")\n",
+    "        print(f\"Predicted Labels: {labels}\")\n",
+    "except Exception as e:\n",
+    "    print(f\"Gagal melakukan inferensi: {e}\")\n",
+    "    sys.exit(1)\n",
+    "\n",
+    "# Analyze error patterns (ORGANISATION predicted as PLACE)\n",
+    "# The label set of this dataset has PERSON/ORGANISATION/PLACE tags only, so a\n",
+    "# DATE-vs-LOC comparison can never match; compare tags that actually exist.\n",
+    "true_tag, wrong_tag = \"B-ORGANISATION\", \"B-PLACE\"\n",
+    "print(\"\\nAnalisis Pola Error (Organisasi diprediksi sebagai Lokasi):\")\n",
+    "# Warn loudly if a tag is missing, instead of silently reporting no errors.\n",
+    "for tag in (true_tag, wrong_tag):\n",
+    "    if tag not in label2id:\n",
+    "        print(f\"Peringatan: label '{tag}' tidak ada dalam daftar label dataset.\")\n",
+    "found_error = False\n",
+    "model.eval()\n",
+    "for i in range(min(100, len(tokenized_dataset[\"test\"]))):\n",
+    "    sample = tokenized_dataset[\"test\"][i]\n",
+    "    input_ids = torch.tensor([sample[\"input_ids\"]], device=device)\n",
+    "    attention_mask = torch.tensor([sample[\"attention_mask\"]], device=device)\n",
+    "    with torch.no_grad():\n",
+    "        outputs = model(input_ids, attention_mask=attention_mask)\n",
+    "    predictions = outputs.logits.argmax(dim=2)[0].cpu().numpy()\n",
+    "    # Keep only positions that carry a real label (first sub-token of each word).\n",
+    "    true_labels = [id2label[label] for label in sample[\"labels\"] if label != -100]\n",
+    "    pred_labels = [id2label[pred] for pred, label in zip(predictions, sample[\"labels\"]) if label != -100]\n",
+    "    if any(true == true_tag and pred == wrong_tag for true, pred in zip(true_labels, pred_labels)):\n",
+    "        tokens = tokenizer.convert_ids_to_tokens(sample[\"input_ids\"])\n",
+    "        print(\"\\nSampel dengan Error (ORGANISATION diprediksi sebagai PLACE):\")\n",
+    "        print(f\"Tokens: {' '.join(tokens)}\")\n",
+    "        print(f\"True Labels: {true_labels}\")\n",
+    "        print(f\"Predicted Labels: {pred_labels}\")\n",
+    "        found_error = True\n",
+    "        # Only the first matching sample is reported.\n",
+    "        break\n",
+    "if not found_error:\n",
+    "    print(\"Tidak ditemukan contoh organisasi yang diprediksi sebagai lokasi dalam 100 sampel.\")\n",
+    "\n",
+    "# Data Security, Privacy, and Ethics\n",
+    "print(\"\\nPertimbangan Keamanan Data, Privasi, dan Etika:\")\n",
+    "print(\"- Dataset bersumber dari berita publik, tidak mengandung informasi sensitif seperti alamat atau nomor identitas.\")\n",
+    "print(\"- Nama orang dalam dataset berasal dari media publik, aman untuk digunakan.\")\n",
+    "print(\"- Dataset mencakup berbagai topik berita, mengurangi risiko bias terhadap entitas tertentu.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "714cfb72",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "93508875",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

ner_model/config.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "_num_labels": 5,
+  "architectures": [
+    "BertForTokenClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "directionality": "bidi",
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "I-PERSON",
+    "1": "B-ORGANISATION",
+    "2": "I-ORGANISATION",
+    "3": "B-PLACE",
+    "4": "I-PLACE",
+    "5": "O",
+    "6": "B-PERSON"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "B-ORGANISATION": 1,
+    "B-PERSON": 6,
+    "B-PLACE": 3,
+    "I-ORGANISATION": 2,
+    "I-PERSON": 0,
+    "I-PLACE": 4,
+    "O": 5
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "output_past": true,
+  "pad_token_id": 0,
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.53.1",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 50000
+}

ner_model/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37b28df302c0498855d30b937557b567a7be050c81501056a493828385199064
+size 495447892

ner_model/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

ner_model/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

ner_model/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

ner_model/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+transformers==4.44.2
+datasets==2.21.0
+torch==2.4.1
+gradio==4.44.0
+pandas==2.2.2
+numpy==1.26.4