{ "cells": [ { "cell_type": "markdown", "id": "d10bfa50537af75f", "metadata": {}, "source": [ "## Experiment exp027-2\n", "xlm-roberta-large, Batch Size: 32, Learning Rate: 2e-5, Warmup Steps: 500" ] }, { "cell_type": "code", "execution_count": 51, "id": "9748a35a024779ae", "metadata": { "ExecuteTime": { "end_time": "2025-06-27T22:06:52.194727Z", "start_time": "2025-06-27T22:06:52.191088Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", "from transformers import (\n", " AutoTokenizer,\n", " BertForTokenClassification,\n", " AutoModelForTokenClassification\n", ")\n", "import torch\n", "import os\n", "\n", "# Order GPUs by PCI bus id and expose only device 1 to this process.\n", "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n", "os.environ[\"CUDA_VISIBLE_DEVICES\"] = '1'\n", "# The `!shell` cells further down fork the kernel process; setting this explicitly\n", "# silences the huggingface/tokenizers fork warning (the remedy the warning itself names).\n", "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"" ] }, { "cell_type": "code", "execution_count": 56, "id": "4ae3d9e4c556a288", "metadata": { "ExecuteTime": { "end_time": "2025-06-27T22:07:26.334867Z", "start_time": "2025-06-27T22:07:26.325629Z" } }, "outputs": [], "source": [ "# Read the task 2 predictions CSV into a DataFrame.\n", "test_comments_spans = pd.read_csv(\"./submissions/task2-predicted.csv\")" ] }, { "cell_type": "code", "execution_count": 57, "id": "156c9b1c48a954b4", "metadata": { "ExecuteTime": { "end_time": "2025-06-27T22:07:30.302897Z", "start_time": "2025-06-27T22:07:30.290021Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
documentcomment_idtypestartend
0NDY-0042compliment021
1NDY-0044affection declaration019
2NDY-0045affection declaration025
3NDY-0045affection declaration2656
4NDY-0045positive feedback5771
..................
5498NDY-203526affection declaration017
5499NDY-203526positive feedback3059
5500NDY-203526positive feedback64104
5501NDY-203526affection declaration105106
5502NDY-203526affection declaration105114
\n", "

5503 rows × 5 columns

\n", "
" ], "text/plain": [ " document comment_id type start end\n", "0 NDY-004 2 compliment 0 21\n", "1 NDY-004 4 affection declaration 0 19\n", "2 NDY-004 5 affection declaration 0 25\n", "3 NDY-004 5 affection declaration 26 56\n", "4 NDY-004 5 positive feedback 57 71\n", "... ... ... ... ... ...\n", "5498 NDY-203 526 affection declaration 0 17\n", "5499 NDY-203 526 positive feedback 30 59\n", "5500 NDY-203 526 positive feedback 64 104\n", "5501 NDY-203 526 affection declaration 105 106\n", "5502 NDY-203 526 affection declaration 105 114\n", "\n", "[5503 rows x 5 columns]" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_comments_spans" ] }, { "cell_type": "code", "execution_count": 58, "id": "2b63b3b12b9648f6", "metadata": { "ExecuteTime": { "end_time": "2025-06-27T22:07:50.819958Z", "start_time": "2025-06-27T22:07:50.699928Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
documentcomment_idcommentpredicted_labelspredicted_probsoffset_mappingtext_tokenspredicted_spans
0NDY-0041Lol i love lochis[0, 0, 0, 0, 0, 0, 0, 0][[0.99999654, 1.7456429e-07, 1.6115715e-07, 1....[[0, 0], [0, 1], [1, 3], [4, 5], [6, 10], [11,...[▁L, ol, ▁i, ▁love, ▁loc, his][]
1NDY-0042ihr singt voll gut :)[0, 2, 12, 12, 12, 12, 12, 0][[0.9999976, 1.1218729e-07, 1.239344e-07, 1.50...[[0, 0], [0, 3], [4, 8], [8, 9], [10, 14], [15...[▁ihr, ▁sing, t, ▁voll, ▁gut, ▁:)][{'type': 'compliment', 'start': 0, 'end': 21,...
2NDY-0043Junge fick dich[0, 0, 0, 0, 0, 0][[0.9999981, 5.8623616e-08, 1.05891374e-07, 1....[[0, 0], [0, 4], [4, 5], [6, 10], [11, 15], [0...[▁Jung, e, ▁fick, ▁dich][]
3NDY-0044Ihr seit die besten[0, 3, 13, 13, 13, 0][[0.99999774, 1.6417343e-07, 1.384722e-07, 1.1...[[0, 0], [0, 3], [4, 8], [9, 12], [13, 19], [0...[▁Ihr, ▁seit, ▁die, ▁besten][{'type': 'affection declaration', 'start': 0,...
4NDY-0045ihr seit die ALLER besten ich finde euch soooo...[0, 3, 13, 13, 13, 13, 13, 3, 13, 13, 13, 13, ...[[0.99999785, 1.2960982e-07, 1.4320104e-07, 1....[[0, 0], [0, 3], [4, 8], [9, 12], [13, 17], [1...[▁ihr, ▁seit, ▁die, ▁ALLE, R, ▁besten, ▁ich, ▁...[{'type': 'affection declaration', 'start': 0,...
...........................
9224NDY-203522hihi kannst du mich grüßen 💕 👋 😍 Achso wusstes...[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 11, 0, 11, 11, ...[[0.99999774, 1.8107521e-07, 1.0220851e-07, 9....[[0, 0], [0, 4], [5, 11], [12, 14], [15, 19], ...[▁hihi, ▁kannst, ▁du, ▁mich, ▁gr, üß, en, ▁, 💕...[{'type': 'positive feedback', 'start': 27, 'e...
9225NDY-203523#Glocke aktiviert 👑 Ich liebe deine Videos 💍 💎...[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 11, 11, 11, 11,...[[0.9999976, 1.1908668e-07, 8.492378e-08, 6.60...[[0, 0], [0, 1], [1, 2], [2, 6], [6, 7], [8, 1...[▁#, G, lock, e, ▁aktiv, iert, ▁, 👑, ▁Ich, ▁li...[{'type': 'positive feedback', 'start': 20, 'e...
9226NDY-203524Bist die beste ❤ Bitte Grüße mich 💕 ❤ 😘 😍[0, 3, 13, 13, 13, 13, 0, 0, 0, 1, 1, 11, 11, ...[[0.9999974, 2.1362885e-07, 1.2580301e-07, 9.5...[[0, 0], [0, 3], [3, 4], [5, 8], [9, 14], [15,...[▁Bis, t, ▁die, ▁beste, ▁❤, ▁Bitte, ▁Grüße, ▁m...[{'type': 'affection declaration', 'start': 0,...
9227NDY-203525Hi Bonny ❤️ War letztens auf'm Flughafen , und...[0, 0, 0, 0, 1, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0,...[[0.99999523, 6.63842e-07, 2.0147786e-07, 1.16...[[0, 0], [0, 2], [3, 6], [6, 8], [9, 10], [10,...[▁Hi, ▁Bon, ny, ▁❤, ️, ▁War, ▁letzten, s, ▁auf...[{'type': 'positive feedback', 'start': 9, 'en...
9228NDY-203526du bist die beste ich bin neu ich hab dich sof...[0, 3, 13, 13, 13, 0, 0, 0, 1, 11, 11, 11, 11,...[[0.999997, 3.4811254e-07, 7.750037e-08, 7.272...[[0, 0], [0, 2], [3, 7], [8, 11], [12, 17], [1...[▁du, ▁bist, ▁die, ▁beste, ▁ich, ▁bin, ▁neu, ▁...[{'type': 'affection declaration', 'start': 0,...
\n", "

9229 rows × 8 columns

\n", "
" ], "text/plain": [ " document comment_id comment \\\n", "0 NDY-004 1 Lol i love lochis \n", "1 NDY-004 2 ihr singt voll gut :) \n", "2 NDY-004 3 Junge fick dich \n", "3 NDY-004 4 Ihr seit die besten \n", "4 NDY-004 5 ihr seit die ALLER besten ich finde euch soooo... \n", "... ... ... ... \n", "9224 NDY-203 522 hihi kannst du mich grüßen 💕 👋 😍 Achso wusstes... \n", "9225 NDY-203 523 #Glocke aktiviert 👑 Ich liebe deine Videos 💍 💎... \n", "9226 NDY-203 524 Bist die beste ❤ Bitte Grüße mich 💕 ❤ 😘 😍 \n", "9227 NDY-203 525 Hi Bonny ❤️ War letztens auf'm Flughafen , und... \n", "9228 NDY-203 526 du bist die beste ich bin neu ich hab dich sof... \n", "\n", " predicted_labels \\\n", "0 [0, 0, 0, 0, 0, 0, 0, 0] \n", "1 [0, 2, 12, 12, 12, 12, 12, 0] \n", "2 [0, 0, 0, 0, 0, 0] \n", "3 [0, 3, 13, 13, 13, 0] \n", "4 [0, 3, 13, 13, 13, 13, 13, 3, 13, 13, 13, 13, ... \n", "... ... \n", "9224 [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 11, 0, 11, 11, ... \n", "9225 [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 11, 11, 11, 11,... \n", "9226 [0, 3, 13, 13, 13, 13, 0, 0, 0, 1, 1, 11, 11, ... \n", "9227 [0, 0, 0, 0, 1, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0,... \n", "9228 [0, 3, 13, 13, 13, 0, 0, 0, 1, 11, 11, 11, 11,... \n", "\n", " predicted_probs \\\n", "0 [[0.99999654, 1.7456429e-07, 1.6115715e-07, 1.... \n", "1 [[0.9999976, 1.1218729e-07, 1.239344e-07, 1.50... \n", "2 [[0.9999981, 5.8623616e-08, 1.05891374e-07, 1.... \n", "3 [[0.99999774, 1.6417343e-07, 1.384722e-07, 1.1... \n", "4 [[0.99999785, 1.2960982e-07, 1.4320104e-07, 1.... \n", "... ... \n", "9224 [[0.99999774, 1.8107521e-07, 1.0220851e-07, 9.... \n", "9225 [[0.9999976, 1.1908668e-07, 8.492378e-08, 6.60... \n", "9226 [[0.9999974, 2.1362885e-07, 1.2580301e-07, 9.5... \n", "9227 [[0.99999523, 6.63842e-07, 2.0147786e-07, 1.16... \n", "9228 [[0.999997, 3.4811254e-07, 7.750037e-08, 7.272... \n", "\n", " offset_mapping \\\n", "0 [[0, 0], [0, 1], [1, 3], [4, 5], [6, 10], [11,... \n", "1 [[0, 0], [0, 3], [4, 8], [8, 9], [10, 14], [15... 
\n", "2 [[0, 0], [0, 4], [4, 5], [6, 10], [11, 15], [0... \n", "3 [[0, 0], [0, 3], [4, 8], [9, 12], [13, 19], [0... \n", "4 [[0, 0], [0, 3], [4, 8], [9, 12], [13, 17], [1... \n", "... ... \n", "9224 [[0, 0], [0, 4], [5, 11], [12, 14], [15, 19], ... \n", "9225 [[0, 0], [0, 1], [1, 2], [2, 6], [6, 7], [8, 1... \n", "9226 [[0, 0], [0, 3], [3, 4], [5, 8], [9, 14], [15,... \n", "9227 [[0, 0], [0, 2], [3, 6], [6, 8], [9, 10], [10,... \n", "9228 [[0, 0], [0, 2], [3, 7], [8, 11], [12, 17], [1... \n", "\n", " text_tokens \\\n", "0 [▁L, ol, ▁i, ▁love, ▁loc, his] \n", "1 [▁ihr, ▁sing, t, ▁voll, ▁gut, ▁:)] \n", "2 [▁Jung, e, ▁fick, ▁dich] \n", "3 [▁Ihr, ▁seit, ▁die, ▁besten] \n", "4 [▁ihr, ▁seit, ▁die, ▁ALLE, R, ▁besten, ▁ich, ▁... \n", "... ... \n", "9224 [▁hihi, ▁kannst, ▁du, ▁mich, ▁gr, üß, en, ▁, 💕... \n", "9225 [▁#, G, lock, e, ▁aktiv, iert, ▁, 👑, ▁Ich, ▁li... \n", "9226 [▁Bis, t, ▁die, ▁beste, ▁❤, ▁Bitte, ▁Grüße, ▁m... \n", "9227 [▁Hi, ▁Bon, ny, ▁❤, ️, ▁War, ▁letzten, s, ▁auf... \n", "9228 [▁du, ▁bist, ▁die, ▁beste, ▁ich, ▁bin, ▁neu, ▁... \n", "\n", " predicted_spans \n", "0 [] \n", "1 [{'type': 'compliment', 'start': 0, 'end': 21,... \n", "2 [] \n", "3 [{'type': 'affection declaration', 'start': 0,... \n", "4 [{'type': 'affection declaration', 'start': 0,... \n", "... ... \n", "9224 [{'type': 'positive feedback', 'start': 27, 'e... \n", "9225 [{'type': 'positive feedback', 'start': 20, 'e... \n", "9226 [{'type': 'affection declaration', 'start': 0,... \n", "9227 [{'type': 'positive feedback', 'start': 9, 'en... \n", "9228 [{'type': 'affection declaration', 'start': 0,... 
\n", "\n", "[9229 rows x 8 columns]" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# NOTE(review): no cell in this notebook as saved assigns `test_comments` -- TODO load it explicitly before this cell.\n", "test_comments" ] }, { "cell_type": "code", "execution_count": 60, "id": "263a51fec4f4672", "metadata": { "ExecuteTime": { "end_time": "2025-06-27T22:09:58.052637Z", "start_time": "2025-06-27T22:09:57.997729Z" } }, "outputs": [], "source": [ "# True for every row whose `predicted_spans` container is non-empty.\n", "# Column-wise map instead of a row-wise DataFrame.apply: same boolean Series, no per-row overhead.\n", "test_comments['has_spans'] = test_comments['predicted_spans'].map(len) > 0" ] }, { "cell_type": "code", "execution_count": 63, "id": "5fa67bbeb303ca3a", "metadata": { "ExecuteTime": { "end_time": "2025-06-27T22:10:35.264094Z", "start_time": "2025-06-27T22:10:35.260301Z" } }, "outputs": [], "source": [ "# Map the boolean flag to the 'yes'/'no' strings stored in the `flausch` column.\n", "test_comments['flausch'] = test_comments['has_spans'].map({True: 'yes', False: 'no'})" ] }, { "cell_type": "code", "execution_count": 66, "id": "fd7679e665286b70", "metadata": { "ExecuteTime": { "end_time": "2025-06-27T22:11:57.164479Z", "start_time": "2025-06-27T22:11:57.150708Z" } }, "outputs": [], "source": [ "# Write only the identifier columns and the label, without the DataFrame index.\n", "test_comments[[\"document\", \"comment_id\", \"flausch\"]].to_csv('./submissions/task1-predicted.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 68, "id": "bd9d8b153b8d27ed", "metadata": { "ExecuteTime": { "end_time": "2025-06-27T22:12:25.303426Z", "start_time": "2025-06-27T22:12:24.850361Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "import shutil\n", "\n", "# Copy in-process instead of shelling out to `cp`: portable and does not fork the kernel.\n", "# An existing destination file is overwritten; the trailing `;` hides the returned path.\n", "shutil.copyfile('./submissions/task1-predicted.csv', './submissions/subtask1_submission2.csv');" ] }, { "cell_type": "code", "execution_count": 70, "id": "5a2738b19dcd4292", "metadata": { "ExecuteTime": { "end_time": "2025-06-27T22:12:43.388207Z", "start_time": "2025-06-27T22:12:42.945847Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "document,comment_id,flausch\r\n", "NDY-004,1,no\r\n", "NDY-004,2,yes\r\n", "NDY-004,3,no\r\n", "NDY-004,4,yes\r\n", "NDY-004,5,yes\r\n", "NDY-004,6,yes\r\n", "NDY-004,7,no\r\n", "NDY-004,8,yes\r\n", "NDY-004,9,no\r\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "# Print the first 10 lines (header included) of the written file as a sanity check.\n", "!head -n 10 './submissions/task1-predicted.csv'" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 5 }