import gradio as gr
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.corpus import wordnet, brown
from spellchecker import SpellChecker
import re
import nltk
nltk.download('brown')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('wordnet')
# Preload resources
detokenizer = TreebankWordDetokenizer()
global_freq_dist = nltk.FreqDist(w.lower() for w in brown.words())
def extract_proper_nouns(text):
    """Collect named entities (people, organizations, places) so they are never altered."""
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    chunks = ne_chunk(tagged)
    proper_nouns = set()
    for chunk in chunks:
        if hasattr(chunk, 'label') and chunk.label() in ('PERSON', 'ORGANIZATION', 'GPE'):
            entity = " ".join(c[0] for c in chunk)
            proper_nouns.add(entity.lower())
    return proper_nouns
def protect_proper_nouns(text, dynamic_entities):
    """Tag each token with a flag marking whether it belongs to a detected named entity."""
    tokens = word_tokenize(text)
    protected = []
    for token in tokens:
        lower_token = token.lower()
        # A token is protected if it appears inside any detected entity string.
        if any(lower_token in entity for entity in dynamic_entities):
            protected.append((token, True))
        else:
            protected.append((token, False))
    return protected
def replace_words_with_synonyms(tokens_with_protection, pos_tags):
    """Swap eligible words for common WordNet synonyms, leaving protected tokens untouched."""
    new_tokens = []
    for (token, protected), (_, tag) in zip(tokens_with_protection, pos_tags):
        if protected or not token.isalpha():
            new_tokens.append(token)
            continue
        # Map the Penn Treebank tag to a WordNet part of speech.
        pos = None
        if tag.startswith('JJ'):
            pos = wordnet.ADJ
        elif tag.startswith('RB'):
            pos = wordnet.ADV
        elif tag.startswith('NN'):
            pos = wordnet.NOUN
        elif tag.startswith('VB'):
            pos = wordnet.VERB
        if pos:
            candidates = []
            for syn in wordnet.synsets(token, pos=pos):
                for lemma in syn.lemmas():
                    candidate = lemma.name().replace('_', ' ')
                    # Skip identical words, multi-word lemmas, and much longer candidates.
                    if candidate.lower() == token.lower() or ' ' in candidate:
                        continue
                    if len(candidate) > len(token) * 1.2:
                        continue
                    candidates.append(candidate)
            if candidates:
                # Prefer the candidate that is most frequent in the Brown corpus.
                best_candidate = max(candidates,
                                     key=lambda x: global_freq_dist[x.lower()],
                                     default=token)
                new_tokens.append(best_candidate)
            else:
                new_tokens.append(token)
        else:
            new_tokens.append(token)
    return new_tokens
def restructure_sentences(text):
    """Split overly long sentences (more than 25 words) at the first comma or semicolon."""
    sentences = sent_tokenize(text)
    restructured = []
    for sent in sentences:
        if len(sent.split()) > 25:
            parts = re.split(r'[,;]', sent)
            if len(parts) > 1:
                sent = parts[0] + '. ' + ' '.join(parts[1:])
        restructured.append(sent)
    return ' '.join(restructured)
def contextual_spell_check(text):
    """Spell-check the text while skipping named entities and non-alphabetic tokens."""
    protected = protect_proper_nouns(text, extract_proper_nouns(text))
    spell = SpellChecker()
    corrected = []
    for token, protected_flag in protected:
        if protected_flag or not token.isalpha():
            corrected.append(token)
            continue
        correction = spell.correction(token)
        if correction:
            # Preserve the original capitalization of the token.
            if token[0].isupper():
                corrected.append(correction.capitalize())
            else:
                corrected.append(correction.lower())
        else:
            corrected.append(token)
    return detokenizer.detokenize(corrected)
def finalize_formatting(text):
    """Tidy spacing around punctuation and normalize quotes and dashes."""
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)  # no space before punctuation
    text = re.sub(r'([(])\s+', r'\1', text)       # no space after opening parenthesis
    text = re.sub(r'\s+([)])', r'\1', text)       # no space before closing parenthesis
    text = re.sub(r'\"(.*?)\"', r'“\1”', text)    # straight quotes to curly quotes
    text = re.sub(r' -- ', r' — ', text)          # double hyphen to em dash
    return text.strip()
def humanize_text(text):
    """Replace words with synonyms (protecting named entities), then restructure long sentences."""
    dynamic_entities = extract_proper_nouns(text)
    protected_tokens = protect_proper_nouns(text, dynamic_entities)
    tokens = [t[0] for t in protected_tokens]
    tags = pos_tag(tokens)
    new_tokens = replace_words_with_synonyms(protected_tokens, tags)
    text = detokenizer.detokenize(new_tokens)
    return restructure_sentences(text)
def process_text(input_text):
    """Full pipeline: humanize, spell-check, then clean up formatting."""
    if input_text:
        humanized = humanize_text(input_text)
        spell_checked = contextual_spell_check(humanized)
        final_output = finalize_formatting(spell_checked)
        return final_output
    return ""
# Gradio Interface
iface = gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=5, placeholder="Enter text here..."),
    outputs=gr.Textbox(lines=5),
    title="AI to Humanized Text Converter",
    description="Enter text and get a more humanized, readable output.",
)
iface.launch()