import gradio as gr
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.corpus import wordnet, brown
from spellchecker import SpellChecker
import re
import nltk

# Download the NLTK data the pipeline relies on (tokenizer, tagger, NE chunker, WordNet, Brown).
# Note: newer NLTK releases may additionally require 'punkt_tab' and the '_eng'/'_tab'
# variants of the tagger and chunker models.
nltk.download('brown')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('wordnet')

# Preload resources
detokenizer = TreebankWordDetokenizer()
# Brown-corpus word frequencies, used later to rank synonym candidates by how common they are.
global_freq_dist = nltk.FreqDist(w.lower() for w in brown.words())
# Collect named entities (people, organizations, places) so later steps can leave them untouched.
def extract_proper_nouns(text):
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    chunks = ne_chunk(tagged)
    proper_nouns = set()
    for chunk in chunks:
        if hasattr(chunk, 'label') and chunk.label() in ('PERSON', 'ORGANIZATION', 'GPE'):
            entity = " ".join(c[0] for c in chunk)
            proper_nouns.add(entity.lower())
    return proper_nouns
# Mark each token as protected (True) if it occurs inside any detected entity, so it is never rewritten.
def protect_proper_nouns(text, dynamic_entities):
    tokens = word_tokenize(text)
    protected = []
    for token in tokens:
        lower_token = token.lower()
        if any(lower_token in entity for entity in dynamic_entities):
            protected.append((token, True))
        else:
            protected.append((token, False))
    return protected
# Swap eligible words for a more common, single-word WordNet synonym of the same part of speech.
def replace_words_with_synonyms(tokens_with_protection, pos_tags):
    new_tokens = []
    for (token, protected), (_, tag) in zip(tokens_with_protection, pos_tags):
        # Leave protected tokens and punctuation/numbers untouched.
        if protected or not token.isalpha():
            new_tokens.append(token)
            continue
        # Map the Penn Treebank tag to a WordNet part of speech.
        pos = None
        if tag.startswith('JJ'):
            pos = wordnet.ADJ
        elif tag.startswith('RB'):
            pos = wordnet.ADV
        elif tag.startswith('NN'):
            pos = wordnet.NOUN
        elif tag.startswith('VB'):
            pos = wordnet.VERB
        if pos:
            candidates = []
            for syn in wordnet.synsets(token, pos=pos):
                for lemma in syn.lemmas():
                    candidate = lemma.name().replace('_', ' ')
                    # Skip the word itself, multi-word lemmas, and candidates much longer than the original.
                    if candidate.lower() == token.lower() or ' ' in candidate:
                        continue
                    if len(candidate) > len(token) * 1.2:
                        continue
                    candidates.append(candidate)
            if candidates:
                # Prefer the candidate that is most frequent in the Brown corpus.
                best_candidate = max(candidates,
                                     key=lambda x: global_freq_dist[x.lower()],
                                     default=token)
                new_tokens.append(best_candidate)
            else:
                new_tokens.append(token)
        else:
            new_tokens.append(token)
    return new_tokens
# Break very long sentences (over 25 words) at the first comma or semicolon.
def restructure_sentences(text):
    sentences = sent_tokenize(text)
    restructured = []
    for sent in sentences:
        if len(sent.split()) > 25:
            parts = re.split(r'[,;]', sent)
            if len(parts) > 1:
                sent = parts[0] + '. ' + ' '.join(parts[1:])
        restructured.append(sent)
    return ' '.join(restructured)
# Spell-check the text while skipping protected entities and non-alphabetic tokens.
def contextual_spell_check(text):
    protected = protect_proper_nouns(text, extract_proper_nouns(text))
    spell = SpellChecker()
    corrected = []
    for token, protected_flag in protected:
        if protected_flag or not token.isalpha():
            corrected.append(token)
            continue
        correction = spell.correction(token)
        if correction:
            # Preserve the original capitalization pattern.
            if token[0].isupper():
                corrected.append(correction.capitalize())
            else:
                corrected.append(correction.lower())
        else:
            corrected.append(token)
    return detokenizer.detokenize(corrected)
# Tidy spacing around punctuation and apply typographic quotes and dashes.
def finalize_formatting(text):
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)   # no space before punctuation
    text = re.sub(r'([(])\s+', r'\1', text)        # no space after an opening parenthesis
    text = re.sub(r'\s+([)])', r'\1', text)        # no space before a closing parenthesis
    text = re.sub(r'\"(.*?)\"', r'“\1”', text)     # straight quotes -> curly quotes
    text = re.sub(r' -- ', r' — ', text)           # double hyphen -> em dash
    return text.strip()
# Full rewriting pass: protect entities, substitute synonyms, then split overly long sentences.
def humanize_text(text):
    dynamic_entities = extract_proper_nouns(text)
    protected_tokens = protect_proper_nouns(text, dynamic_entities)
    tokens = [t[0] for t in protected_tokens]
    tags = pos_tag(tokens)
    new_tokens = replace_words_with_synonyms(protected_tokens, tags)
    text = detokenizer.detokenize(new_tokens)
    return restructure_sentences(text)
# Gradio callback: humanize, spell-check, then polish the formatting.
def process_text(input_text):
    if input_text:
        humanized = humanize_text(input_text)
        spell_checked = contextual_spell_check(humanized)
        final_output = finalize_formatting(spell_checked)
        return final_output
    return ""
# Gradio Interface
iface = gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=5, placeholder="Enter text here..."),
    outputs=gr.Textbox(lines=5),
    title="AI to Humanized Text Converter",
    description="Enter text and get a more humanized, readable output.",
)

iface.launch()