Spaces:

Norelad
/

coptic-translation-interface

Sleeping

App Files Files Community

Norelad committed on Oct 17

Commit

7e208b2

verified ·

1 Parent(s): b550729

Upload apertus_ui.py

Browse files

Files changed (1) hide show

apertus_ui.py +377 -0

apertus_ui.py ADDED Viewed

	@@ -0,0 +1,377 @@

+import hashlib
+import os
+import re
+import xml.etree.ElementTree as ET
+import streamlit as st
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from coptic_keyboard import coptic_keyboard
+from coptic_morphology import analyze_coptic_morphology, CopticMorphologyTokenizer
+from morphology_informed_translation import get_morphology_enhanced_translation
+#Coptic alphabet helper
+COPTIC_ALPHABET = {
+#    'Ⲁ': 'Alpha', 'Ⲃ': 'Beta', 'Ⲅ': 'Gamma', 'Ⲇ': 'Delta', 'Ⲉ': 'Epsilon', 'Ⲋ': 'Zeta',
+    'Ⲏ': 'Eta', 'Ⲑ': 'Theta', 'Ⲓ': 'Iota', 'Ⲕ': 'Kappa', 'Ⲗ': 'Lambda', 'Ⲙ': 'Mu',
+    'Ⲛ': 'Nu', 'Ⲝ': 'Xi', 'Ⲟ': 'Omicron', 'Ⲡ': 'Pi', 'Ⲣ': 'Rho', 'Ⲥ': 'Sigma',
+    'Ⲧ': 'Tau', 'Ⲩ': 'Upsilon', 'Ⲫ': 'Phi', 'Ⲭ': 'Chi', 'Ⲯ': 'Psi', 'Ⲱ': 'Omega',
+    'Ϣ': 'Shai', 'Ϥ': 'Fai', 'Ϧ': 'Khei', 'Ϩ': 'Hori', 'Ϫ': 'Gangia', 'Ϭ': 'Shima', 'Ϯ': 'Ti'
+}
+# Coptic linguistic prompts
+COPTIC_PROMPTS = {
+    'dialect_analysis': "Analyze the Coptic dialect of this text and identify linguistic features:",
+    'translation': "Translate this Coptic text to English, preserving theological and cultural context:",
+    'transcription': "Provide a romanized transcription of this Coptic text:",
+    'morphology': "Analyze the morphological structure of these Coptic words:",
+    'lexicon_lookup': "Look up these Coptic words in the lexicon and provide Greek etymologies:"
+}
+# Lexicon loader
+@st.cache_data
+def load_coptic_lexicon(file_path=None):
+    """Load Coptic lexicon from various formats including TEI XML"""
+    if not file_path or not os.path.exists(file_path):
+        return {}
+    lexicon = {}
+    try:
+        # Handle XML format (TEI structure for Comprehensive Coptic Lexicon)
+        if file_path.endswith('.xml'):
+            tree = ET.parse(file_path)
+            root = tree.getroot()
+            # Handle TEI namespace
+            ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
+            # Find entries in TEI format
+            entries = root.findall('.//tei:entry', ns)
+            for entry in entries:  # Load ALL entries, no limit
+                coptic_word = ""
+                definition = ""
+                # Extract Coptic headword from TEI structure
+                coptic_word = ""
+                orth_elem = entry.find('.//tei:orth', ns)
+                if orth_elem is not None and orth_elem.text:
+                    coptic_word = orth_elem.text.strip()
+                # Extract definition - try multiple approaches
+                definition = ""
+                # Try def elements
+                def_elems = entry.findall('.//tei:def', ns)
+                if def_elems:
+                    definitions = [d.text.strip() for d in def_elems if d.text]
+                    definition = "; ".join(definitions[:3])
+                # If no def, try cit elements
+                if not definition:
+                    cit_elems = entry.findall('.//tei:cit', ns)
+                    if cit_elems:
+                        definitions = [c.text.strip() for c in cit_elems if c.text]
+                        definition = "; ".join(definitions[:2])
+                # Store if we have both word and definition
+                if coptic_word and definition:
+                    # Less aggressive cleaning - keep Coptic Unicode
+                    if any('\u2C80' <= char <= '\u2CFF' for char in coptic_word):
+                        lexicon[coptic_word] = definition[:400]
+        # Handle text formats
+        else:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                for line in f:
+                    line = line.strip()
+                    if not line:
+                        continue
+                    # Support multiple separators
+                    separator = None
+                    for sep in ['\t', '|', ',', ';']:
+                        if sep in line:
+                            separator = sep
+                            break
+                    if separator:
+                        parts = line.split(separator, 1)
+                        if len(parts) >= 2:
+                            coptic_word = parts[0].strip()
+                            definition = parts[1].strip()
+                            lexicon[coptic_word] = definition
+    except Exception as e:
+        st.error(f"Error loading lexicon: {str(e)}")
+    return lexicon
+# Translation settings
+st.set_page_config(page_title="Coptic Translation Interface", layout="wide")
+# Clear translation direction
+col1, col2 = st.columns(2)
+with col1:
+    st.write("**Source:** Coptic (ⲘⲉⲧⲢⲉⲙ̀ⲛⲭⲏⲙⲓ)")
+with col2:
+    target_lang = st.selectbox("**Target Language:**",
+                              ["English", "Français", "Deutsch", "Español"],
+                              key="target_language")
+# Sidebar for Coptic tools
+with st.sidebar:
+    st.header("Coptic Tools")
+    # Lexicon file uploader
+    lexicon_file = st.file_uploader("Upload Coptic Lexicon",
+                                   type=['txt', 'tsv', 'csv', 'xml'],
+                                   help="Supports: Text (TAB/pipe separated), XML (Crum format), CSV")
+    # Load lexicon
+    if lexicon_file:
+        # Save uploaded file temporarily
+        with open("temp_lexicon.txt", "wb") as f:
+            f.write(lexicon_file.getbuffer())
+        coptic_lexicon = load_coptic_lexicon("temp_lexicon.txt")
+        st.success(f"Loaded {len(coptic_lexicon)} lexicon entries")
+    else:
+        # Try to load the comprehensive lexicon if available
+        comprehensive_lexicon_path = "Comprehensive_Coptic_Lexicon-v1.2-2020.xml"
+        if os.path.exists(comprehensive_lexicon_path):
+            coptic_lexicon = load_coptic_lexicon(comprehensive_lexicon_path)
+            if coptic_lexicon:
+                st.info(f"Loaded Comprehensive Coptic Lexicon: {len(coptic_lexicon)} entries")
+            else:
+                coptic_lexicon = {}
+        else:
+            coptic_lexicon = {}
+    # Coptic alphabet reference
+    if st.expander("Coptic Alphabet"):
+        for letter, name in COPTIC_ALPHABET.items():
+            st.text(f"{letter} - {name}")
+    # Lexicon search with working methods
+    if coptic_lexicon:
+        st.subheader("Lexicon Search")
+        # Method selection for search
+        search_method = st.radio("Input method:",
+                               ["Latin → Coptic", "Paste Coptic Text"],
+                               key="search_method")
+        search_term = ""
+        if search_method == "Latin → Coptic":
+            # Method 1: Transliteration
+            transliteration_map = {
+                'a': 'ⲁ', 'b': 'ⲃ', 'g': 'ⲅ', 'd': 'ⲇ', 'e': 'ⲉ', 'z': 'ⲍ',
+                'h': 'ⲏ', 'q': 'ⲑ', 'i': 'ⲓ', 'k': 'ⲕ', 'l': 'ⲗ', 'm': 'ⲙ',
+                'n': 'ⲛ', 'x': 'ⲝ', 'o': 'ⲟ', 'p': 'ⲡ', 'r': 'ⲣ', 's': 'ⲥ',
+                't': 'ⲧ', 'u': 'ⲩ', 'f': 'ⲫ', 'c': 'ⲭ', 'y': 'ⲯ', 'w': 'ⲱ',
+                'S': 'ϣ', 'F': 'ϥ', 'X': 'ϧ', 'H': 'ϩ', 'J': 'ϫ', 'C': 'ϭ', 'T': 'ϯ'
+            }
+            latin_input = st.text_input("Type Latin (a=ⲁ, noute=ⲛⲟⲩⲧⲉ):", key="lexicon_latin")
+            if latin_input:
+                search_term = ""
+                for char in latin_input:
+                    search_term += transliteration_map.get(char, char)
+                st.write(f"**Searching for:** {search_term}")
+        else:
+            # Method 3: External Coptic text
+            pasted_text = st.text_input("Paste Coptic text:", key="lexicon_coptic")
+            if pasted_text:
+                # Check if it contains Coptic Unicode
+                is_coptic = any(0x2C80 <= ord(char) <= 0x2CFF for char in pasted_text)
+                if is_coptic:
+                    st.success("✅ Coptic Unicode detected")
+                    search_term = pasted_text
+                else:
+                    st.warning("⚠️ Converting PDF text to Coptic Unicode")
+                    # Convert common PDF/Greek characters to Coptic
+                    pdf_to_coptic = {
+                        'α': 'ⲁ', 'β': 'ⲃ', 'γ': 'ⲅ', 'δ': 'ⲇ', 'ε': 'ⲉ', 'ζ': 'ⲍ',
+                        'η': 'ⲏ', 'θ': 'ⲑ', 'ι': 'ⲓ', 'κ': 'ⲕ', 'λ': 'ⲗ', 'μ': 'ⲙ',
+                        'ν': 'ⲛ', 'ξ': 'ⲝ', 'ο': 'ⲟ', 'π': 'ⲡ', 'ρ': 'ⲣ', 'σ': 'ⲥ',
+                        'τ': 'ⲧ', 'υ': 'ⲩ', 'φ': 'ⲫ', 'χ': 'ⲭ', 'ψ': 'ⲯ', 'ω': 'ⲱ',
+                        'ς': 'ⲥ', 'ϣ': 'ϣ', 'ϥ': 'ϥ', 'ϧ': 'ϧ', 'ϩ': 'ϩ', 'ϫ': 'ϫ', 'ϭ': 'ϭ', 'ϯ': 'ϯ',
+                        # Latin fallbacks
+                        'a': 'ⲁ', 'b': 'ⲃ', 'g': 'ⲅ', 'd': 'ⲇ', 'e': 'ⲉ', 'z': 'ⲍ',
+                        'h': 'ⲏ', 'q': 'ⲑ', 'i': 'ⲓ', 'k': 'ⲕ', 'l': 'ⲗ', 'm': 'ⲙ',
+                        'n': 'ⲛ', 'x': 'ⲝ', 'o': 'ⲟ', 'p': 'ⲡ', 'r': 'ⲣ', 's': 'ⲥ',
+                        't': 'ⲧ', 'u': 'ⲩ', 'f': 'ⲫ', 'c': 'ⲭ', 'y': 'ⲯ', 'w': 'ⲱ'
+                    }
+                    converted = ""
+                    for char in pasted_text:
+                        converted += pdf_to_coptic.get(char, char)
+                    search_term = converted
+                    st.write(f"**Converted to:** {converted}")
+        # Perform search
+        if search_term:
+            # Exact match first
+            if search_term in coptic_lexicon:
+                st.success(f"**Exact Match: {search_term}**")
+                st.markdown(f"**Definition:** {coptic_lexicon[search_term]}")
+                st.divider()
+            # Partial matches (starts with)
+            starts_with = [k for k in coptic_lexicon.keys() if k.startswith(search_term) and k != search_term]
+            if starts_with:
+                st.write("**Words starting with your search:**")
+                for match in starts_with[:8]:
+                    with st.expander(f"📖 {match}"):
+                        st.write(coptic_lexicon[match])
+                st.divider()
+            # Contains matches
+            contains = [k for k in coptic_lexicon.keys() if search_term in k and not k.startswith(search_term)]
+            if contains:
+                st.write("**Words containing your search:**")
+                for match in contains[:5]:
+                    with st.expander(f"📖 {match}"):
+                        st.write(coptic_lexicon[match])
+            # If no matches at all
+            if not (search_term in coptic_lexicon or starts_with or contains):
+                st.error("❌ No matches found in lexicon")
+                st.info(f"Searched for: **{search_term}** | Available entries: {len(coptic_lexicon)}")
+# Load model (cached)
+@st.cache_resource
+def load_model():
+    model_path = "swiss-ai/Apertus-8B-Instruct-2509"
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(model_path)
+        model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16)
+        return tokenizer, model
+    except Exception as e:
+        st.error(f"Failed to load model: {str(e)}")
+        return None, None
+tokenizer, model = load_model()
+# Check if model loaded successfully
+if tokenizer is None or model is None:
+    st.error("❌ Model failed to load. Translation unavailable.")
+    st.stop()
+# Morphological Analysis Section
+st.subheader("🔍 Morphological Analysis")
+morph_text = st.text_area(
+    "Enter Coptic text for morphological analysis:",
+    height=100,
+    placeholder="ⲡⲉϫⲉⲡⲛⲟⲩⲧⲉⲛⲛⲁϩⲣⲛⲡⲓⲥⲣⲁⲏⲗ..."
+)
+if st.button("Analyze Morphology"):
+    if morph_text.strip():
+        with st.spinner("Analyzing morphology..."):
+            analysis = analyze_coptic_morphology(morph_text)
+        st.subheader("Morphological Breakdown:")
+        st.text(analysis)
+        with st.expander("Detailed Analysis"):
+            tokenizer_morph = CopticMorphologyTokenizer()
+            analyses = tokenizer_morph.tokenize_text(morph_text)
+            for i, word_analysis in enumerate(analyses):
+                if word_analysis['morphemes']:
+                    st.write(f"**Word {i+1}: {word_analysis['word']}**")
+                    for morpheme in word_analysis['morphemes']:
+                        st.write(f"  - {morpheme['form']} ({morpheme['type']}: {morpheme['function']})")
+                    st.write("---")
+    else:
+        st.warning("Please enter some Coptic text to analyze.")
+# Enhanced translation with morphological context
+col1, col2 = st.columns(2)
+with col1:
+    if st.button("🧠 Enhanced Translation (with morphology)"):
+        if morph_text.strip():
+            with st.spinner("Generating morphology-enhanced translation..."):
+                enhanced_translation = get_morphology_enhanced_translation(
+                    morph_text, tokenizer, model, "English"
+                )
+            st.subheader("Enhanced Translation:")
+            st.write(enhanced_translation)
+        else:
+            st.warning("Please enter Coptic text first.")
+with col2:
+    if st.button("📝 Standard Translation"):
+        if morph_text.strip():
+            with st.spinner("Generating standard translation..."):
+                standard_prompt = f"Translate this Coptic text to English: {morph_text}"
+                messages = [{"role": "user", "content": standard_prompt}]
+                text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+                inputs = tokenizer([text], return_tensors="pt")
+                with torch.no_grad():
+                    outputs = model.generate(**inputs, max_new_tokens=300, temperature=0.6, top_p=0.9, do_sample=True)
+                response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
+            st.subheader("Standard Translation:")
+            st.write(response)
+        else:
+            st.warning("Please enter Coptic text first.")
+# Chat interface
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+# Display chat history
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.markdown(message["content"])
+# User input
+if prompt := st.chat_input("Enter Coptic text to translate..."):
+    # Convert to Coptic Unicode
+    char_to_coptic = {
+        'α': 'ⲁ', 'β': 'ⲃ', 'γ': 'ⲅ', 'δ': 'ⲇ', 'ε': 'ⲉ', 'ζ': 'ⲍ',
+        'η': 'ⲏ', 'θ': 'ⲑ', 'ι': 'ⲓ', 'κ': 'ⲕ', 'λ': 'ⲗ', 'μ': 'ⲙ',
+        'ν': 'ⲛ', 'ξ': 'ⲝ', 'ο': 'ⲟ', 'π': 'ⲡ', 'ρ': 'ⲣ', 'σ': 'ⲥ',
+        'τ': 'ⲧ', 'υ': 'ⲩ', 'φ': 'ⲫ', 'χ': 'ⲭ', 'ψ': 'ⲯ', 'ω': 'ⲱ', 'ς': 'ⲥ',
+        'a': 'ⲁ', 'b': 'ⲃ', 'g': 'ⲅ', 'd': 'ⲇ', 'e': 'ⲉ', 'z': 'ⲍ',
+        'h': 'ⲏ', 'q': 'ⲑ', 'i': 'ⲓ', 'k': 'ⲕ', 'l': 'ⲗ', 'm': 'ⲙ',
+        'n': 'ⲛ', 'x': 'ⲝ', 'o': 'ⲟ', 'p': 'ⲡ', 'r': 'ⲣ', 's': 'ⲥ',
+        't': 'ⲧ', 'u': 'ⲩ', 'f': 'ⲫ', 'c': 'ⲭ', 'y': 'ⲯ', 'w': 'ⲱ',
+        'S': 'ϣ', 'F': 'ϥ', 'X': 'ϧ', 'H': 'ϩ', 'J': 'ϫ', 'C': 'ϭ', 'T': 'ϯ'
+    }
+    coptic_text = "".join(char_to_coptic.get(char, char) for char in prompt)
+    # Display user input
+    st.session_state.messages.append({"role": "user", "content": coptic_text})
+    with st.chat_message("user"):
+        st.markdown(coptic_text)
+    # Generate translation
+    translation_prompt = f"You are a Coptic language expert. Translate this Coptic text to {target_lang} and provide the meaning: {coptic_text}"
+    with st.chat_message("assistant"):
+        try:
+            messages = [{"role": "user", "content": translation_prompt}]
+            text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            inputs = tokenizer([text], return_tensors="pt")
+            with torch.no_grad():
+                outputs = model.generate(**inputs, max_new_tokens=300, temperature=0.6, top_p=0.9, do_sample=True)
+            response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
+            st.markdown(response)
+            st.session_state.messages.append({"role": "assistant", "content": response})
+        except Exception as e:
+            st.error(f"Translation error: {str(e)}")