Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,7 +7,23 @@ import torch
|
|
| 7 |
import numpy as np
|
| 8 |
from collections import Counter
|
| 9 |
import os
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
# Configure page
|
| 12 |
st.set_page_config(
|
| 13 |
page_title="Arabic Poem Analysis",
|
|
@@ -54,23 +70,6 @@ def split_text(text, max_length=512):
|
|
| 54 |
return chunks
|
| 55 |
|
| 56 |
def clean_arabic_text(text):
|
| 57 |
-
# Add Arabic stop words
|
| 58 |
-
ARABIC_STOP_WORDS = {
|
| 59 |
-
'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
|
| 60 |
-
'ุฃู', 'ู', 'ู', 'ู', 'ุจ', 'ู', 'ูู', 'ุงู', 'ูุฐุง', 'ูุฐู', 'ุฐูู',
|
| 61 |
-
'ุชูู', 'ูุคูุงุก', 'ูู
', 'ูู', 'ูู', 'ูู', 'ูุญู', 'ุงูุช', 'ุงูุชู
',
|
| 62 |
-
'ูุงู', 'ูุงูุช', 'ูููู', 'ุชููู', 'ุงู', 'ูู', 'ุจุนุถ', 'ุบูุฑ', 'ุญูู',
|
| 63 |
-
'ุนูุฏ', 'ูุฏ', 'ููุฏ', 'ูู
', 'ูู', 'ูู', 'ู
ุง', 'ู
ุงุฐุง', 'ู
ุชู', 'ููู',
|
| 64 |
-
'ุงูู', 'ูู
ุงุฐุง', 'ุงูุฐู', 'ุงูุชู', 'ุงูุฐูู', 'ุงููุงุชู', 'ุงูููุงุชู',
|
| 65 |
-
'ุงูุงู', 'ุจูู', 'ููู', 'ุชุญุช', 'ุงู
ุงู
', 'ุฎูู', 'ุญูู', 'ูุจู', 'ุจุนุฏ',
|
| 66 |
-
'ู', 'ุฃู', 'ูู', 'ูู', 'ูู
', 'ูู', 'ูู', 'ู
ู', 'ูู', 'ูู', 'ููุฉ',
|
| 67 |
-
'ูู
ุง', 'ููุง', 'ู
ูุฐ', 'ููุฏ', 'ููุง', 'ููุณ', 'ููู
', 'ุญูุซ', 'ููุงู',
|
| 68 |
-
'ุฌุฏุง', 'ุฐุงุช', 'ุถู
ู', 'ุงูู', 'ูุฏู', 'ุนููู', 'ู
ุซู', 'ููู', 'ุนูุฏ',
|
| 69 |
-
'ุฃู
ุง', 'ูุฐู', 'ูุฃู', 'ููู', 'ููุงู', 'ูุฏู', 'ููุงู', 'ููู', 'ููู',
|
| 70 |
-
'ููู', 'ุชูู', 'ููู
', 'ููู', 'ููู', 'ููู', 'ูููุฏ', 'ูู
ู', 'ููุฐุง',
|
| 71 |
-
'ุงูู', 'ุถู
ู', 'ุงููุง', 'ุฌู
ูุน', 'ุงูุฐู', 'ูุจู', 'ุจุนุฏ', 'ุญูู', 'ุงูุถุง',
|
| 72 |
-
'ูุงุฒู
', 'ุญุงุฌุฉ', 'ุนูู', 'ูุฌุจ', 'ุตุงุฑ', 'ุตุงุฑุช', 'ุชุญุช', 'ุถุฏ'
|
| 73 |
-
}
|
| 74 |
"""Clean Arabic text by removing stop words and normalizing."""
|
| 75 |
words = text.split()
|
| 76 |
cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
|
|
@@ -319,21 +318,21 @@ if uploaded_file is not None:
|
|
| 319 |
|
| 320 |
if topic_strategy == "Manual":
|
| 321 |
n_documents = len(df)
|
| 322 |
-
max_topics =
|
|
|
|
| 323 |
|
| 324 |
n_topics = st.slider(
|
| 325 |
"Number of Topics",
|
| 326 |
-
min_value=
|
| 327 |
max_value=max_topics,
|
| 328 |
-
value=
|
| 329 |
help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
|
| 330 |
)
|
| 331 |
|
| 332 |
st.info(f"""
|
| 333 |
💡 For your dataset of {n_documents:,} documents:
|
| 334 |
-
-
|
| 335 |
-
-
|
| 336 |
-
- Recommended range: {max(2, max_topics//5)}-{max_topics//2}
|
| 337 |
""")
|
| 338 |
|
| 339 |
with col2:
|
|
|
|
| 7 |
import numpy as np
|
| 8 |
from collections import Counter
|
| 9 |
import os
|
| 10 |
+
# Add Arabic stop words
|
| 11 |
+
ARABIC_STOP_WORDS = {
|
| 12 |
+
'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
|
| 13 |
+
'ุฃู', 'ู', 'ู', 'ู', 'ุจ', 'ู', 'ูู', 'ุงู', 'ูุฐุง', 'ูุฐู', 'ุฐูู',
|
| 14 |
+
'ุชูู', 'ูุคูุงุก', 'ูู
', 'ูู', 'ูู', 'ูู', 'ูุญู', 'ุงูุช', 'ุงูุชู
',
|
| 15 |
+
'ูุงู', 'ูุงูุช', 'ูููู', 'ุชููู', 'ุงู', 'ูู', 'ุจุนุถ', 'ุบูุฑ', 'ุญูู',
|
| 16 |
+
'ุนูุฏ', 'ูุฏ', 'ููุฏ', 'ูู
', 'ูู', 'ูู', 'ู
ุง', 'ู
ุงุฐุง', 'ู
ุชู', 'ููู',
|
| 17 |
+
'ุงูู', 'ูู
ุงุฐุง', 'ุงูุฐู', 'ุงูุชู', 'ุงูุฐูู', 'ุงููุงุชู', 'ุงูููุงุชู',
|
| 18 |
+
'ุงูุงู', 'ุจูู', 'ููู', 'ุชุญุช', 'ุงู
ุงู
', 'ุฎูู', 'ุญูู', 'ูุจู', 'ุจุนุฏ',
|
| 19 |
+
'ู', 'ุฃู', 'ูู', 'ูู', 'ูู
', 'ูู', 'ูู', 'ู
ู', 'ูู', 'ูู', 'ููุฉ',
|
| 20 |
+
'ูู
ุง', 'ููุง', 'ู
ูุฐ', 'ููุฏ', 'ููุง', 'ููุณ', 'ููู
', 'ุญูุซ', 'ููุงู',
|
| 21 |
+
'ุฌุฏุง', 'ุฐุงุช', 'ุถู
ู', 'ุงูู', 'ูุฏู', 'ุนููู', 'ู
ุซู', 'ููู', 'ุนูุฏ',
|
| 22 |
+
'ุฃู
ุง', 'ูุฐู', 'ูุฃู', 'ููู', 'ููุงู', 'ูุฏู', 'ููุงู', 'ููู', 'ููู',
|
| 23 |
+
'ููู', 'ุชูู', 'ููู
', 'ููู', 'ููู', 'ููู', 'ูููุฏ', 'ูู
ู', 'ููุฐุง',
|
| 24 |
+
'ุงูู', 'ุถู
ู', 'ุงููุง', 'ุฌู
ูุน', 'ุงูุฐู', 'ูุจู', 'ุจุนุฏ', 'ุญูู', 'ุงูุถุง',
|
| 25 |
+
'ูุงุฒู
', 'ุญุงุฌุฉ', 'ุนูู', 'ูุฌุจ', 'ุตุงุฑ', 'ุตุงุฑุช', 'ุชุญุช', 'ุถุฏ'
|
| 26 |
+
}
|
| 27 |
# Configure page
|
| 28 |
st.set_page_config(
|
| 29 |
page_title="Arabic Poem Analysis",
|
|
|
|
| 70 |
return chunks
|
| 71 |
|
| 72 |
def clean_arabic_text(text):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
"""Clean Arabic text by removing stop words and normalizing."""
|
| 74 |
words = text.split()
|
| 75 |
cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
|
|
|
|
| 318 |
|
| 319 |
if topic_strategy == "Manual":
|
| 320 |
n_documents = len(df)
|
| 321 |
+
max_topics = min(500, n_documents // 50)
|
| 322 |
+
min_topics = 5
|
| 323 |
|
| 324 |
n_topics = st.slider(
|
| 325 |
"Number of Topics",
|
| 326 |
+
min_value=min_topics,
|
| 327 |
max_value=max_topics,
|
| 328 |
+
value=default_topics,
|
| 329 |
help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
|
| 330 |
)
|
| 331 |
|
| 332 |
st.info(f"""
|
| 333 |
💡 For your dataset of {n_documents:,} documents:
|
| 334 |
+
- Available topic range: {min_topics}-{max_topics}
|
| 335 |
+
- Recommended range: {max_topics//10}-{max_topics//3} for optimal coherence
|
|
|
|
| 336 |
""")
|
| 337 |
|
| 338 |
with col2:
|