Norelad committed on
Commit
7e208b2
·
verified ·
1 Parent(s): b550729

Upload apertus_ui.py

Browse files
Files changed (1) hide show
  1. apertus_ui.py +377 -0
apertus_ui.py ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer
3
+ import torch
4
+ import os
5
+ import xml.etree.ElementTree as ET
6
+ import re
7
+ from coptic_keyboard import coptic_keyboard
8
+ from coptic_morphology import analyze_coptic_morphology, CopticMorphologyTokenizer
9
+ from morphology_informed_translation import get_morphology_enhanced_translation
10
+
11
+
12
# Coptic alphabet helper: capital letter -> letter name, in alphabetical order.
# The sidebar renders this mapping as a reference list, so it has to cover the
# whole alphabet: the Alpha-Zeta row is restored here (it was commented out),
# and Zeta is keyed by Ⲍ (U+2C8C) rather than Ⲋ (U+2C8A, the numeral sign Sou).
COPTIC_ALPHABET = {
    'Ⲁ': 'Alpha', 'Ⲃ': 'Beta', 'Ⲅ': 'Gamma', 'Ⲇ': 'Delta', 'Ⲉ': 'Epsilon', 'Ⲍ': 'Zeta',
    'Ⲏ': 'Eta', 'Ⲑ': 'Theta', 'Ⲓ': 'Iota', 'Ⲕ': 'Kappa', 'Ⲗ': 'Lambda', 'Ⲙ': 'Mu',
    'Ⲛ': 'Nu', 'Ⲝ': 'Xi', 'Ⲟ': 'Omicron', 'Ⲡ': 'Pi', 'Ⲣ': 'Rho', 'Ⲥ': 'Sigma',
    'Ⲧ': 'Tau', 'Ⲩ': 'Upsilon', 'Ⲫ': 'Phi', 'Ⲭ': 'Chi', 'Ⲯ': 'Psi', 'Ⲱ': 'Omega',
    # Letters of Demotic origin (encoded in the Greek and Coptic block).
    'Ϣ': 'Shai', 'Ϥ': 'Fai', 'Ϧ': 'Khei', 'Ϩ': 'Hori', 'Ϫ': 'Gangia', 'Ϭ': 'Shima', 'Ϯ': 'Ti',
}
20
+
21
# Instruction templates for Coptic linguistic tasks, keyed by task name.
# Each value is the leading instruction that the Coptic text is appended to.
COPTIC_PROMPTS = dict(
    dialect_analysis="Analyze the Coptic dialect of this text and identify linguistic features:",
    translation="Translate this Coptic text to English, preserving theological and cultural context:",
    transcription="Provide a romanized transcription of this Coptic text:",
    morphology="Analyze the morphological structure of these Coptic words:",
    lexicon_lookup="Look up these Coptic words in the lexicon and provide Greek etymologies:",
)
29
+
30
# Lexicon loader
def _element_text(elem):
    """Return all text inside *elem*, nested tags included, with whitespace collapsed."""
    return " ".join("".join(elem.itertext()).split())


def _has_coptic_letter(word):
    """Return True if *word* contains a Coptic letter.

    Checks the Coptic block (U+2C80-U+2CFF) and the Coptic letters that live in
    the Greek and Coptic block (U+03E2-U+03EF: ϣ ϥ ϧ ϩ ϫ ϭ ϯ), so that
    headwords written only with the latter (e.g. "ϯ") are not discarded.
    """
    return any(
        '\u2C80' <= char <= '\u2CFF' or '\u03E2' <= char <= '\u03EF'
        for char in word
    )


def _looks_like_xml(file_path):
    """Return True if the file's first non-blank byte (after an optional BOM) is '<'."""
    with open(file_path, 'rb') as f:
        head = f.read(256)
    return head.lstrip(b'\xef\xbb\xbf \t\r\n').startswith(b'<')


def _iter_tei_entries(root):
    """Yield (headword, definition) pairs from a parsed TEI lexicon tree."""
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

    for entry in root.findall('.//tei:entry', ns):  # all entries, no limit
        # Headword: text of the first <orth> element.
        orth_elem = entry.find('.//tei:orth', ns)
        coptic_word = ""
        if orth_elem is not None and orth_elem.text:
            coptic_word = orth_elem.text.strip()
        if not coptic_word or not _has_coptic_letter(coptic_word):
            continue

        # Definition: up to three <def> texts, else up to two <cit> texts.
        # Empty texts are dropped *before* joining; otherwise elements that
        # only hold child tags would produce a bogus "; " definition.
        definition = ""
        for tag, limit in (('def', 3), ('cit', 2)):
            texts = [_element_text(e) for e in entry.findall(f'.//tei:{tag}', ns)]
            texts = [t for t in texts if t]
            if texts:
                definition = "; ".join(texts[:limit])
                break

        if definition:
            yield coptic_word, definition[:400]


def _iter_delimited_entries(lines):
    """Yield (headword, definition) pairs from 'word<sep>definition' lines.

    The separator is the first of TAB, '|', ',' or ';' that occurs in the
    line; lines without a separator or without a headword are skipped.
    """
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue

        separator = next((sep for sep in ('\t', '|', ',', ';') if sep in line), None)
        if separator is None:
            continue

        coptic_word, definition = (part.strip() for part in line.split(separator, 1))
        if coptic_word:
            yield coptic_word, definition


@st.cache_data
def load_coptic_lexicon(file_path=None):
    """Load a Coptic lexicon from a TEI XML file or a delimited text file.

    Args:
        file_path: Path of the lexicon file. A file is parsed as TEI XML when
            its name ends in ".xml" or its content starts with "<"; anything
            else is read as UTF-8 text with one "word<sep>definition" entry
            per line.

    Returns:
        A dict mapping Coptic headword to definition. It is empty when
        *file_path* is missing or does not exist. If parsing fails part-way,
        an error is shown with ``st.error`` and the entries collected up to
        that point are returned.
    """
    if not file_path or not os.path.exists(file_path):
        return {}

    lexicon = {}

    try:
        if file_path.lower().endswith('.xml') or _looks_like_xml(file_path):
            # TEI structure (e.g. the Comprehensive Coptic Lexicon)
            root = ET.parse(file_path).getroot()
            for coptic_word, definition in _iter_tei_entries(root):
                lexicon[coptic_word] = definition
        else:
            # 'utf-8-sig' also accepts plain UTF-8 and keeps a leading BOM
            # out of the first headword.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                for coptic_word, definition in _iter_delimited_entries(f):
                    lexicon[coptic_word] = definition
    except Exception as e:
        # Best-effort loader: report the problem and keep what was parsed.
        st.error(f"Error loading lexicon: {str(e)}")

    return lexicon
109
+
110
# Translation settings
st.set_page_config(page_title="Coptic Translation Interface", layout="wide")

# Translation direction: fixed Coptic source on the left, selectable target
# language on the right. `target_lang` is read later by the chat translation.
source_col, target_col = st.columns(2)
source_col.write("**Source:** Coptic (ⲘⲉⲧⲢⲉⲙ̀ⲛⲭⲏⲙⲓ)")
target_lang = target_col.selectbox(
    "**Target Language:**",
    ["English", "Français", "Deutsch", "Español"],
    key="target_language",
)
121
+
122
# Sidebar for Coptic tools: lexicon upload/loading, alphabet reference and
# lexicon search. Defines `coptic_lexicon` (dict: headword -> definition).
with st.sidebar:
    st.header("Coptic Tools")

    # Lexicon file uploader
    lexicon_file = st.file_uploader("Upload Coptic Lexicon",
                                    type=['txt', 'tsv', 'csv', 'xml'],
                                    help="Supports: Text (TAB/pipe separated), XML (Crum format), CSV")

    # Load lexicon
    coptic_lexicon = {}
    if lexicon_file:
        import hashlib  # local import: only needed to name the upload's temp file

        # load_coptic_lexicon is cached by its path argument and checks the
        # file suffix to recognise XML. Reusing one fixed ".txt" name can
        # therefore serve a stale cached lexicon after a new upload and hides
        # the upload's real extension. Name the temp file after the content
        # hash and keep the original extension instead.
        upload_bytes = lexicon_file.getvalue()
        upload_ext = os.path.splitext(lexicon_file.name)[1].lower() or ".txt"
        upload_digest = hashlib.sha256(upload_bytes).hexdigest()[:16]
        temp_lexicon_path = f"temp_lexicon_{upload_digest}{upload_ext}"

        # The script reruns on every interaction; write each upload only once.
        if not os.path.exists(temp_lexicon_path):
            with open(temp_lexicon_path, "wb") as f:
                f.write(upload_bytes)

        coptic_lexicon = load_coptic_lexicon(temp_lexicon_path)
        st.success(f"Loaded {len(coptic_lexicon)} lexicon entries")
    else:
        # Try to load the comprehensive lexicon if available
        comprehensive_lexicon_path = "Comprehensive_Coptic_Lexicon-v1.2-2020.xml"
        if os.path.exists(comprehensive_lexicon_path):
            coptic_lexicon = load_coptic_lexicon(comprehensive_lexicon_path)
            if coptic_lexicon:
                st.info(f"Loaded Comprehensive Coptic Lexicon: {len(coptic_lexicon)} entries")

    # Coptic alphabet reference. `with` is required: st.expander returns a
    # container object (always truthy), so `if st.expander(...)` would draw
    # the letters outside the expander.
    with st.expander("Coptic Alphabet"):
        for letter, name in COPTIC_ALPHABET.items():
            st.text(f"{letter} - {name}")

    # Lexicon search
    if coptic_lexicon:
        st.subheader("Lexicon Search")

        # Method selection for search
        search_method = st.radio("Input method:",
                                 ["Latin → Coptic", "Paste Coptic Text"],
                                 key="search_method")

        search_term = ""

        if search_method == "Latin → Coptic":
            # Method 1: transliterate Latin keystrokes to Coptic letters
            transliteration_map = {
                'a': 'ⲁ', 'b': 'ⲃ', 'g': 'ⲅ', 'd': 'ⲇ', 'e': 'ⲉ', 'z': 'ⲍ',
                'h': 'ⲏ', 'q': 'ⲑ', 'i': 'ⲓ', 'k': 'ⲕ', 'l': 'ⲗ', 'm': 'ⲙ',
                'n': 'ⲛ', 'x': 'ⲝ', 'o': 'ⲟ', 'p': 'ⲡ', 'r': 'ⲣ', 's': 'ⲥ',
                't': 'ⲧ', 'u': 'ⲩ', 'f': 'ⲫ', 'c': 'ⲭ', 'y': 'ⲯ', 'w': 'ⲱ',
                'S': 'ϣ', 'F': 'ϥ', 'X': 'ϧ', 'H': 'ϩ', 'J': 'ϫ', 'C': 'ϭ', 'T': 'ϯ',
            }

            # Strip so stray leading/trailing spaces cannot defeat the lookup.
            latin_input = st.text_input("Type Latin (a=ⲁ, noute=ⲛⲟⲩⲧⲉ):", key="lexicon_latin").strip()

            if latin_input:
                search_term = latin_input.translate(str.maketrans(transliteration_map))
                st.write(f"**Searching for:** {search_term}")

        else:
            # Method 2: Coptic text pasted from an external source
            pasted_text = st.text_input("Paste Coptic text:", key="lexicon_coptic").strip()

            if pasted_text:
                # Coptic letters live in U+2C80-U+2CFF plus U+03E2-U+03EF
                # (ϣ ϥ ϧ ϩ ϫ ϭ ϯ) of the Greek and Coptic block.
                is_coptic = any(
                    0x2C80 <= ord(char) <= 0x2CFF or 0x03E2 <= ord(char) <= 0x03EF
                    for char in pasted_text
                )

                if is_coptic:
                    st.success("✅ Coptic Unicode detected")
                    search_term = pasted_text
                else:
                    st.warning("⚠️ Converting PDF text to Coptic Unicode")

                    # Convert common PDF/Greek characters to Coptic; characters
                    # without an entry are kept unchanged.
                    pdf_to_coptic = {
                        'α': 'ⲁ', 'β': 'ⲃ', 'γ': 'ⲅ', 'δ': 'ⲇ', 'ε': 'ⲉ', 'ζ': 'ⲍ',
                        'η': 'ⲏ', 'θ': 'ⲑ', 'ι': 'ⲓ', 'κ': 'ⲕ', 'λ': 'ⲗ', 'μ': 'ⲙ',
                        'ν': 'ⲛ', 'ξ': 'ⲝ', 'ο': 'ⲟ', 'π': 'ⲡ', 'ρ': 'ⲣ', 'σ': 'ⲥ',
                        'τ': 'ⲧ', 'υ': 'ⲩ', 'φ': 'ⲫ', 'χ': 'ⲭ', 'ψ': 'ⲯ', 'ω': 'ⲱ',
                        'ς': 'ⲥ',
                        # Latin fallbacks
                        'a': 'ⲁ', 'b': 'ⲃ', 'g': 'ⲅ', 'd': 'ⲇ', 'e': 'ⲉ', 'z': 'ⲍ',
                        'h': 'ⲏ', 'q': 'ⲑ', 'i': 'ⲓ', 'k': 'ⲕ', 'l': 'ⲗ', 'm': 'ⲙ',
                        'n': 'ⲛ', 'x': 'ⲝ', 'o': 'ⲟ', 'p': 'ⲡ', 'r': 'ⲣ', 's': 'ⲥ',
                        't': 'ⲧ', 'u': 'ⲩ', 'f': 'ⲫ', 'c': 'ⲭ', 'y': 'ⲯ', 'w': 'ⲱ',
                    }

                    converted = pasted_text.translate(str.maketrans(pdf_to_coptic))

                    search_term = converted
                    st.write(f"**Converted to:** {converted}")

        # Perform search
        if search_term:
            exact_match = search_term in coptic_lexicon
            # Prefix matches (excluding the exact hit) and inner matches are
            # disjoint, so no headword is listed twice.
            starts_with = [k for k in coptic_lexicon
                           if k.startswith(search_term) and k != search_term]
            contains = [k for k in coptic_lexicon
                        if search_term in k and not k.startswith(search_term)]

            # Exact match first
            if exact_match:
                st.success(f"**Exact Match: {search_term}**")
                st.markdown(f"**Definition:** {coptic_lexicon[search_term]}")
                st.divider()

            # Partial matches (starts with)
            if starts_with:
                st.write("**Words starting with your search:**")
                for match in starts_with[:8]:
                    with st.expander(f"📖 {match}"):
                        st.write(coptic_lexicon[match])
                st.divider()

            # Contains matches
            if contains:
                st.write("**Words containing your search:**")
                for match in contains[:5]:
                    with st.expander(f"📖 {match}"):
                        st.write(coptic_lexicon[match])

            # If no matches at all
            if not (exact_match or starts_with or contains):
                st.error("❌ No matches found in lexicon")
                st.info(f"Searched for: **{search_term}** | Available entries: {len(coptic_lexicon)}")
248
+
249
# Load model (cached)
@st.cache_resource
def _load_model_cached(model_path):
    """Load the tokenizer and causal LM for *model_path*.

    Raises whatever ``from_pretrained`` raises. Errors are deliberately not
    handled here: a cached function that returns normally has its return
    value cached, so swallowing the error would pin a failed load for the
    lifetime of the process.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16)
    return tokenizer, model


def load_model():
    """Return the cached ``(tokenizer, model)`` pair for the Apertus model.

    Returns:
        ``(tokenizer, model)`` on success. On failure an error is shown with
        ``st.error`` and ``(None, None)`` is returned; the failure is not
        cached, so the next script run retries the load.
    """
    model_path = "swiss-ai/Apertus-8B-Instruct-2509"
    try:
        return _load_model_cached(model_path)
    except Exception as e:
        st.error(f"Failed to load model: {str(e)}")
        return None, None


# Keep the cache-clearing hook that the decorated function used to expose.
load_model.clear = _load_model_cached.clear
260
+
261
tokenizer, model = load_model()

# Without both components nothing below can translate: report and halt the script.
model_components = (tokenizer, model)
if any(component is None for component in model_components):
    st.error("❌ Model failed to load. Translation unavailable.")
    st.stop()
267
+
268
# Morphological Analysis Section
st.subheader("🔍 Morphological Analysis")

# The text entered here is also the input of the translation buttons below.
morph_text = st.text_area(
    "Enter Coptic text for morphological analysis:",
    height=100,
    placeholder="ⲡⲉϫⲉⲡⲛⲟⲩⲧⲉⲛⲛⲁϩⲣⲛⲡⲓⲥⲣⲁⲏⲗ..."
)

if st.button("Analyze Morphology"):
    if not morph_text.strip():
        st.warning("Please enter some Coptic text to analyze.")
    else:
        with st.spinner("Analyzing morphology..."):
            analysis = analyze_coptic_morphology(morph_text)

        # Plain-text summary produced by the analysis helper
        st.subheader("Morphological Breakdown:")
        st.text(analysis)

        # Per-word morpheme listing; words without morphemes are not shown
        with st.expander("Detailed Analysis"):
            morph_tokenizer = CopticMorphologyTokenizer()
            word_analyses = morph_tokenizer.tokenize_text(morph_text)

            for position, word_info in enumerate(word_analyses, start=1):
                morphemes = word_info['morphemes']
                if not morphemes:
                    continue
                st.write(f"**Word {position}: {word_info['word']}**")
                for part in morphemes:
                    st.write(f" - {part['form']} ({part['type']}: {part['function']})")
                st.write("---")
297
+
298
# Translation of the text entered in the morphology box, side by side:
# morphology-enhanced (left) and plain prompt (right). Both honour the
# selected target language and report failures instead of crashing the page.
col1, col2 = st.columns(2)
with col1:
    if st.button("🧠 Enhanced Translation (with morphology)"):
        if morph_text.strip():
            with st.spinner("Generating morphology-enhanced translation..."):
                try:
                    # NOTE(review): assumes the helper's fourth argument is a
                    # free-form target-language name (it was hard-coded to
                    # "English") — confirm against its definition.
                    enhanced_translation = get_morphology_enhanced_translation(
                        morph_text, tokenizer, model, target_lang
                    )
                except Exception as e:
                    st.error(f"Translation error: {str(e)}")
                else:
                    st.subheader("Enhanced Translation:")
                    st.write(enhanced_translation)
        else:
            st.warning("Please enter Coptic text first.")

with col2:
    if st.button("📝 Standard Translation"):
        if morph_text.strip():
            with st.spinner("Generating standard translation..."):
                try:
                    standard_prompt = f"Translate this Coptic text to {target_lang}: {morph_text}"
                    messages = [{"role": "user", "content": standard_prompt}]
                    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
                    # Keep the inputs on the same device as the model.
                    inputs = tokenizer([text], return_tensors="pt").to(model.device)

                    with torch.no_grad():
                        outputs = model.generate(**inputs, max_new_tokens=300, temperature=0.6, top_p=0.9, do_sample=True)

                    # generate() returns prompt + continuation; decode only
                    # the newly generated tokens.
                    prompt_length = inputs["input_ids"].shape[1]
                    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
                except Exception as e:
                    st.error(f"Translation error: {str(e)}")
                else:
                    st.subheader("Standard Translation:")
                    st.write(response)
        else:
            st.warning("Please enter Coptic text first.")
329
+
330
# Chat interface: the conversation is kept in session state across reruns
if "messages" not in st.session_state:
    st.session_state["messages"] = []

# Replay every stored message in its role's chat bubble
for past_message in st.session_state["messages"]:
    role, content = past_message["role"], past_message["content"]
    with st.chat_message(role):
        st.markdown(content)
338
+
339
# User input
prompt = st.chat_input("Enter Coptic text to translate...")
if prompt:
    # Greek letters and Latin keystrokes are mapped to Coptic Unicode;
    # characters without an entry pass through unchanged.
    greek_to_coptic = {
        'α': 'ⲁ', 'β': 'ⲃ', 'γ': 'ⲅ', 'δ': 'ⲇ', 'ε': 'ⲉ', 'ζ': 'ⲍ',
        'η': 'ⲏ', 'θ': 'ⲑ', 'ι': 'ⲓ', 'κ': 'ⲕ', 'λ': 'ⲗ', 'μ': 'ⲙ',
        'ν': 'ⲛ', 'ξ': 'ⲝ', 'ο': 'ⲟ', 'π': 'ⲡ', 'ρ': 'ⲣ', 'σ': 'ⲥ',
        'τ': 'ⲧ', 'υ': 'ⲩ', 'φ': 'ⲫ', 'χ': 'ⲭ', 'ψ': 'ⲯ', 'ω': 'ⲱ', 'ς': 'ⲥ',
    }
    latin_to_coptic = {
        'a': 'ⲁ', 'b': 'ⲃ', 'g': 'ⲅ', 'd': 'ⲇ', 'e': 'ⲉ', 'z': 'ⲍ',
        'h': 'ⲏ', 'q': 'ⲑ', 'i': 'ⲓ', 'k': 'ⲕ', 'l': 'ⲗ', 'm': 'ⲙ',
        'n': 'ⲛ', 'x': 'ⲝ', 'o': 'ⲟ', 'p': 'ⲡ', 'r': 'ⲣ', 's': 'ⲥ',
        't': 'ⲧ', 'u': 'ⲩ', 'f': 'ⲫ', 'c': 'ⲭ', 'y': 'ⲯ', 'w': 'ⲱ',
        'S': 'ϣ', 'F': 'ϥ', 'X': 'ϧ', 'H': 'ϩ', 'J': 'ϫ', 'C': 'ϭ', 'T': 'ϯ',
    }
    coptic_table = str.maketrans({**greek_to_coptic, **latin_to_coptic})
    coptic_text = prompt.translate(coptic_table)

    # Record and show the converted user message
    st.session_state.messages.append({"role": "user", "content": coptic_text})
    with st.chat_message("user"):
        st.markdown(coptic_text)

    # Instruction sent to the model, in the language chosen at the top of the page
    translation_prompt = f"You are a Coptic language expert. Translate this Coptic text to {target_lang} and provide the meaning: {coptic_text}"

    with st.chat_message("assistant"):
        try:
            chat_turns = [{"role": "user", "content": translation_prompt}]
            rendered = tokenizer.apply_chat_template(chat_turns, tokenize=False, add_generation_prompt=True)
            encoded = tokenizer([rendered], return_tensors="pt")

            with torch.no_grad():
                generated = model.generate(**encoded, max_new_tokens=300, temperature=0.6, top_p=0.9, do_sample=True)

            # Skip the prompt tokens: only the continuation is the reply
            prompt_token_count = len(encoded.input_ids[0])
            reply_tokens = generated[0][prompt_token_count:]
            response = tokenizer.decode(reply_tokens, skip_special_tokens=True)

            st.markdown(response)
            st.session_state.messages.append({"role": "assistant", "content": response})
        except Exception as e:
            st.error(f"Translation error: {str(e)}")