import re

import gradio as gr
import nltk
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.corpus import wordnet, brown
from spellchecker import SpellChecker

# Fetch the NLTK corpora and models used below (no-ops if already cached).
for resource in ('brown', 'punkt', 'averaged_perceptron_tagger',
                 'maxent_ne_chunker', 'words', 'wordnet'):
    nltk.download(resource, quiet=True)

# Preload shared resources: a detokenizer for turning token lists back into
# text, and a Brown-corpus frequency distribution used to rank synonyms.
detokenizer = TreebankWordDetokenizer()
global_freq_dist = nltk.FreqDist(w.lower() for w in brown.words())

def extract_proper_nouns(text):
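    """Return the lower-cased PERSON/ORGANIZATION/GPE entities found in the text."""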
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    chunks = ne_chunk(tagged)
    proper_nouns = set()

    for chunk in chunks:
        if hasattr(chunk, 'label') and chunk.label() in ('PERSON', 'ORGANIZATION', 'GPE'):
            entity = " ".join(c[0] for c in chunk)
            proper_nouns.add(entity.lower())
    return proper_nouns

def protect_proper_nouns(text, dynamic_entities):
    """Pair each token with a flag marking whether it is part of a named entity."""
    tokens = word_tokenize(text)
    protected = []
    for token in tokens:
        lower_token = token.lower()
        # Match at the word level so short tokens such as "a" or "in" are not
        # accidentally protected by substring hits inside longer entity names.
        is_protected = any(lower_token in entity.split() for entity in dynamic_entities)
        protected.append((token, is_protected))
    return protected

def replace_words_with_synonyms(tokens_with_protection, pos_tags):
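    """Swap eligible words for a common single-word WordNet synonym, skipping protected tokens."""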
    new_tokens = []
    for (token, protected), (_, tag) in zip(tokens_with_protection, pos_tags):
        if protected or not token.isalpha():
            new_tokens.append(token)
            continue

        # Map the Penn Treebank tag to a WordNet POS constant.
        pos = None
        if tag.startswith('JJ'):
            pos = wordnet.ADJ
        elif tag.startswith('RB'):
            pos = wordnet.ADV
        elif tag.startswith('NN'):
            pos = wordnet.NOUN
        elif tag.startswith('VB'):
            pos = wordnet.VERB

        if pos:
            # Collect single-word synonyms that differ from the original word
            # and are not much longer than it.
            candidates = []
            for syn in wordnet.synsets(token, pos=pos):
                for lemma in syn.lemmas():
                    candidate = lemma.name().replace('_', ' ')
                    if candidate.lower() == token.lower() or ' ' in candidate:
                        continue
                    if len(candidate) > len(token) * 1.2:
                        continue
                    candidates.append(candidate)
            if candidates:
                # Prefer the candidate that is most frequent in the Brown corpus.
                best_candidate = max(candidates,
                                     key=lambda x: global_freq_dist[x.lower()])
                new_tokens.append(best_candidate)
            else:
                new_tokens.append(token)
        else:
            new_tokens.append(token)
    return new_tokens

def restructure_sentences(text):
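    """Break sentences longer than 25 words after their first comma or semicolon."""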
    sentences = sent_tokenize(text)
    restructured = []
    for sent in sentences:
        if len(sent.split()) > 25:
            parts = re.split(r'[,;]', sent)
            if len(parts) > 1:
                sent = parts[0] + '. ' + ' '.join(parts[1:])
        restructured.append(sent)
    return ' '.join(restructured)

def contextual_spell_check(text):
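    """Spell-correct tokens, leaving named entities and non-alphabetic tokens untouched."""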
    protected = protect_proper_nouns(text, extract_proper_nouns(text))
    spell = SpellChecker()
    corrected = []
    for token, protected_flag in protected:
        if protected_flag or not token.isalpha():
            corrected.append(token)
            continue
        correction = spell.correction(token)
        if correction:
            if token[0].isupper():
                corrected.append(correction.capitalize())
            else:
                corrected.append(correction.lower())
        else:
            corrected.append(token)
    return detokenizer.detokenize(corrected)

def finalize_formatting(text):
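    """Tidy spacing around punctuation and parentheses, curl straight quotes, and convert double hyphens to em dashes."""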
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)
    text = re.sub(r'([(])\s+', r'\1', text)
    text = re.sub(r'\s+([)])', r'\1', text)
    text = re.sub(r'\"(.*?)\"', r'“\1”', text)
    text = re.sub(r' -- ', r' — ', text)
    return text.strip()

def humanize_text(text):
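    """Replace words with common synonyms while protecting named entities, then restructure long sentences."""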
    dynamic_entities = extract_proper_nouns(text)
    protected_tokens = protect_proper_nouns(text, dynamic_entities)
    tokens = [t[0] for t in protected_tokens]
    tags = pos_tag(tokens)
    new_tokens = replace_words_with_synonyms(protected_tokens, tags)
    text = detokenizer.detokenize(new_tokens)
    return restructure_sentences(text)

def process_text(input_text):
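    """Pipeline entry point: humanize, spell-check, then clean up formatting."""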
    if input_text:
        humanized = humanize_text(input_text)
        spell_checked = contextual_spell_check(humanized)
        final_output = finalize_formatting(spell_checked)
        return final_output
    return ""

# Gradio Interface
iface = gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=5, placeholder="Enter text here..."),
    outputs=gr.Textbox(lines=5),
    title="AI to Humanized Text Converter",
    description="Enter text and get a more humanized, readable output.",
)
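
# For quick manual testing without the UI, process_text() can also be called
# directly, e.g. print(process_text("Teh report was verry well recieved.")).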

if __name__ == "__main__":
    iface.launch()