# AI-To-Humanize / app.py
import re

import gradio as gr
import nltk
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.corpus import wordnet, brown
from spellchecker import SpellChecker

# Download the NLTK resources used below (tokenizer, tagger, NE chunker, corpora).
nltk.download('brown')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('wordnet')
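# Note: newer NLTK releases (3.8.2+) ship some of these resources under split
# names. If the downloads above still raise LookupError at runtime, the extra
# downloads below should cover it (resource names as of NLTK 3.9; adjust to
# your installed version):
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')
# nltk.download('maxent_ne_chunker_tab')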
# Preload shared resources: a detokenizer to rebuild text from token lists, and a
# word-frequency distribution over the Brown corpus used to rank synonym candidates.
detokenizer = TreebankWordDetokenizer()
global_freq_dist = nltk.FreqDist(w.lower() for w in brown.words())
def extract_proper_nouns(text):
    """Return a set of lowercased PERSON/ORGANIZATION/GPE entities found in the text."""
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    chunks = ne_chunk(tagged)
    proper_nouns = set()
    for chunk in chunks:
        if hasattr(chunk, 'label') and chunk.label() in ('PERSON', 'ORGANIZATION', 'GPE'):
            entity = " ".join(c[0] for c in chunk)
            proper_nouns.add(entity.lower())
    return proper_nouns
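# Illustrative call (the exact output depends on the NE chunker's model):
#   extract_proper_nouns("Alice Johnson works at Acme Corporation in Paris.")
#   -> roughly {"alice johnson", "acme corporation", "paris"}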
def protect_proper_nouns(text, dynamic_entities):
    """Tag each token with a flag marking whether it belongs to a detected named entity."""
    tokens = word_tokenize(text)
    protected = []
    for token in tokens:
        lower_token = token.lower()
        # Protect tokens that appear as whole words inside any detected entity
        # (split the entity so short tokens are not matched as substrings).
        if any(lower_token in entity.split() for entity in dynamic_entities):
            protected.append((token, True))
        else:
            protected.append((token, False))
    return protected
def replace_words_with_synonyms(tokens_with_protection, pos_tags):
    """Swap eligible tokens for a more common WordNet synonym, leaving protected tokens intact."""
    new_tokens = []
    for (token, protected), (_, tag) in zip(tokens_with_protection, pos_tags):
        if protected or not token.isalpha():
            new_tokens.append(token)
            continue
        # Map the Penn Treebank tag to a WordNet part of speech.
        pos = None
        if tag.startswith('JJ'):
            pos = wordnet.ADJ
        elif tag.startswith('RB'):
            pos = wordnet.ADV
        elif tag.startswith('NN'):
            pos = wordnet.NOUN
        elif tag.startswith('VB'):
            pos = wordnet.VERB
        if pos:
            candidates = []
            for syn in wordnet.synsets(token, pos=pos):
                for lemma in syn.lemmas():
                    candidate = lemma.name().replace('_', ' ')
                    # Skip the word itself, multi-word lemmas, and much longer candidates.
                    if candidate.lower() == token.lower() or ' ' in candidate:
                        continue
                    if len(candidate) > len(token) * 1.2:
                        continue
                    candidates.append(candidate)
            if candidates:
                # Prefer the candidate that is most frequent in the Brown corpus.
                best_candidate = max(candidates, key=lambda x: global_freq_dist[x.lower()])
                new_tokens.append(best_candidate)
            else:
                new_tokens.append(token)
        else:
            new_tokens.append(token)
    return new_tokens
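# Illustrative effect (actual replacements depend on WordNet and Brown frequencies):
# a token like "purchase" tagged as a verb may be replaced with a shorter, more
# common synonym such as "buy", while protected entity tokens pass through unchanged.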
def restructure_sentences(text):
    """Break sentences longer than 25 words at their first comma or semicolon."""
    sentences = sent_tokenize(text)
    restructured = []
    for sent in sentences:
        if len(sent.split()) > 25:
            parts = re.split(r'[,;]', sent)
            if len(parts) > 1:
                sent = parts[0].strip() + '. ' + ' '.join(p.strip() for p in parts[1:])
        restructured.append(sent)
    return ' '.join(restructured)
def contextual_spell_check(text):
    """Spell-check each token while preserving named entities and original capitalization."""
    protected = protect_proper_nouns(text, extract_proper_nouns(text))
    spell = SpellChecker()
    corrected = []
    for token, protected_flag in protected:
        if protected_flag or not token.isalpha():
            corrected.append(token)
            continue
        correction = spell.correction(token)
        if correction:
            # Keep the casing of the original token's first letter.
            if token[0].isupper():
                corrected.append(correction.capitalize())
            else:
                corrected.append(correction.lower())
        else:
            corrected.append(token)
    return detokenizer.detokenize(corrected)
def finalize_formatting(text):
    """Tidy spacing around punctuation, convert straight quotes to curly quotes, and normalize dashes."""
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)  # no space before punctuation
    text = re.sub(r'([(])\s+', r'\1', text)       # no space after an opening parenthesis
    text = re.sub(r'\s+([)])', r'\1', text)       # no space before a closing parenthesis
    text = re.sub(r'"(.*?)"', r'“\1”', text)      # straight quotes -> curly quotes
    text = re.sub(r' -- ', ' — ', text)           # double hyphen -> em dash
    return text.strip()
def humanize_text(text):
    """Run the full pipeline: protect entities, swap in synonyms, then restructure long sentences."""
    dynamic_entities = extract_proper_nouns(text)
    protected_tokens = protect_proper_nouns(text, dynamic_entities)
    tokens = [t[0] for t in protected_tokens]
    tags = pos_tag(tokens)
    new_tokens = replace_words_with_synonyms(protected_tokens, tags)
    text = detokenizer.detokenize(new_tokens)
    return restructure_sentences(text)
def process_text(input_text):
    """Gradio callback: humanize, spell-check, and format the input text."""
    if not input_text:
        return ""
    humanized = humanize_text(input_text)
    spell_checked = contextual_spell_check(humanized)
    final_output = finalize_formatting(spell_checked)
    return final_output
# Gradio Interface
iface = gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=5, placeholder="Enter text here..."),
    outputs=gr.Textbox(lines=5),
    title="AI to Humanized Text Converter",
    description="Enter text and get a more humanized, readable output.",
)

iface.launch()
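# To exercise the pipeline without launching the UI, something like the following
# (hypothetical input) should print a spell-corrected, lightly reworded version:
#   print(process_text("Teh quick brown fox jumps over the lazy dog near Acme Corporation."))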