import gradio as gr
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.corpus import wordnet, brown
from spellchecker import SpellChecker
import re
import nltk

nltk.download('brown')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('wordnet')

# Preload resources
detokenizer = TreebankWordDetokenizer()
global_freq_dist = nltk.FreqDist(w.lower() for w in brown.words())


def extract_proper_nouns(text):
    """Collect named entities (people, organizations, places) so they are never rewritten."""
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    chunks = ne_chunk(tagged)
    proper_nouns = set()
    for chunk in chunks:
        if hasattr(chunk, 'label') and chunk.label() in ('PERSON', 'ORGANIZATION', 'GPE'):
            entity = " ".join(c[0] for c in chunk)
            proper_nouns.add(entity.lower())
    return proper_nouns


def protect_proper_nouns(text, dynamic_entities):
    """Tag each token with a flag marking whether it belongs to a protected named entity."""
    tokens = word_tokenize(text)
    protected = []
    for token in tokens:
        lower_token = token.lower()
        if any(lower_token in entity for entity in dynamic_entities):
            protected.append((token, True))
        else:
            protected.append((token, False))
    return protected


def replace_words_with_synonyms(tokens_with_protection, pos_tags):
    """Swap eligible adjectives, adverbs, nouns, and verbs for common single-word synonyms."""
    new_tokens = []
    for (token, protected), (_, tag) in zip(tokens_with_protection, pos_tags):
        if protected or not token.isalpha():
            new_tokens.append(token)
            continue
        # Map the Penn Treebank tag to the matching WordNet part of speech
        pos = None
        if tag.startswith('JJ'):
            pos = wordnet.ADJ
        elif tag.startswith('RB'):
            pos = wordnet.ADV
        elif tag.startswith('NN'):
            pos = wordnet.NOUN
        elif tag.startswith('VB'):
            pos = wordnet.VERB
        if pos:
            candidates = []
            for syn in wordnet.synsets(token, pos=pos):
                for lemma in syn.lemmas():
                    candidate = lemma.name().replace('_', ' ')
                    # Skip the word itself, multi-word lemmas, and candidates much longer than the original
                    if candidate.lower() == token.lower() or ' ' in candidate:
                        continue
                    if len(candidate) > len(token) * 1.2:
                        continue
                    candidates.append(candidate)
            if candidates:
                # Prefer the candidate most frequent in the Brown corpus
                best_candidate = max(candidates, key=lambda x: global_freq_dist[x.lower()], default=token)
                new_tokens.append(best_candidate)
            else:
                new_tokens.append(token)
        else:
            new_tokens.append(token)
    return new_tokens


def restructure_sentences(text):
    """Split sentences longer than 25 words at the first comma or semicolon."""
    sentences = sent_tokenize(text)
    restructured = []
    for sent in sentences:
        if len(sent.split()) > 25:
            parts = re.split(r'[,;]', sent)
            if len(parts) > 1:
                sent = parts[0] + '. ' + ' '.join(parts[1:])
        restructured.append(sent)
    return ' '.join(restructured)


def contextual_spell_check(text):
    """Spell-check every token except protected named entities, preserving capitalization."""
    protected = protect_proper_nouns(text, extract_proper_nouns(text))
    spell = SpellChecker()
    corrected = []
    for token, protected_flag in protected:
        if protected_flag or not token.isalpha():
            corrected.append(token)
            continue
        correction = spell.correction(token)
        if correction:
            if token[0].isupper():
                corrected.append(correction.capitalize())
            else:
                corrected.append(correction.lower())
        else:
            corrected.append(token)
    return detokenizer.detokenize(corrected)


def finalize_formatting(text):
    """Tidy spacing around punctuation and normalize quotes and dashes."""
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)
    text = re.sub(r'([(])\s+', r'\1', text)
    text = re.sub(r'\s+([)])', r'\1', text)
    text = re.sub(r'\"(.*?)\"', r'“\1”', text)
    text = re.sub(r' -- ', r' — ', text)
    return text.strip()


def humanize_text(text):
    """Run entity protection, synonym replacement, and sentence restructuring in sequence."""
    dynamic_entities = extract_proper_nouns(text)
    protected_tokens = protect_proper_nouns(text, dynamic_entities)
    tokens = [t[0] for t in protected_tokens]
    tags = pos_tag(tokens)
    new_tokens = replace_words_with_synonyms(protected_tokens, tags)
    text = detokenizer.detokenize(new_tokens)
    return restructure_sentences(text)


def process_text(input_text):
    """Full pipeline: humanize, spell-check, then clean up formatting."""
    if input_text:
        humanized = humanize_text(input_text)
        spell_checked = contextual_spell_check(humanized)
        final_output = finalize_formatting(spell_checked)
        return final_output
    return ""


# Gradio Interface
iface = gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=5, placeholder="Enter text here..."),
    outputs=gr.Textbox(lines=5),
    title="AI to Humanized Text Converter",
    description="Enter text and get a more humanized, readable output.",
)
iface.launch()