import re

import gradio as gr
import nltk
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.corpus import wordnet, brown
from spellchecker import SpellChecker

# Fetch the NLTK corpora and models used below (no-ops if already cached).
for resource in ('brown', 'punkt', 'averaged_perceptron_tagger',
                 'maxent_ne_chunker', 'words', 'wordnet'):
    nltk.download(resource, quiet=True)

# Preload shared resources: a detokenizer for turning token lists back into
# text, and a Brown-corpus frequency distribution used to rank synonyms.
detokenizer = TreebankWordDetokenizer()
global_freq_dist = nltk.FreqDist(w.lower() for w in brown.words())

def extract_proper_nouns(text):
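    """Return the lower-cased PERSON/ORGANIZATION/GPE entities found in the text."""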
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    chunks = ne_chunk(tagged)
    proper_nouns = set()

    for chunk in chunks:
        if hasattr(chunk, 'label') and chunk.label() in ('PERSON', 'ORGANIZATION', 'GPE'):
            entity = " ".join(c[0] for c in chunk)
            proper_nouns.add(entity.lower())
    return proper_nouns

def protect_proper_nouns(text, dynamic_entities):
    """Pair each token with a flag marking whether it is part of a named entity."""
    tokens = word_tokenize(text)
    protected = []
    for token in tokens:
        lower_token = token.lower()
        # Match at the word level so short tokens such as "a" or "in" are not
        # accidentally protected by substring hits inside longer entity names.
        is_protected = any(lower_token in entity.split() for entity in dynamic_entities)
        protected.append((token, is_protected))
    return protected

def replace_words_with_synonyms(tokens_with_protection, pos_tags):
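    """Swap eligible words for a common single-word WordNet synonym, skipping protected tokens."""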
    new_tokens = []
    for (token, protected), (_, tag) in zip(tokens_with_protection, pos_tags):
        if protected or not token.isalpha():
            new_tokens.append(token)
            continue

        # Map the Penn Treebank tag to a WordNet POS constant.
        pos = None
        if tag.startswith('JJ'):
            pos = wordnet.ADJ
        elif tag.startswith('RB'):
            pos = wordnet.ADV
        elif tag.startswith('NN'):
            pos = wordnet.NOUN
        elif tag.startswith('VB'):
            pos = wordnet.VERB

        if pos:
            # Collect single-word synonyms that differ from the original word
            # and are not much longer than it.
            candidates = []
            for syn in wordnet.synsets(token, pos=pos):
                for lemma in syn.lemmas():
                    candidate = lemma.name().replace('_', ' ')
                    if candidate.lower() == token.lower() or ' ' in candidate:
                        continue
                    if len(candidate) > len(token) * 1.2:
                        continue
                    candidates.append(candidate)
            if candidates:
                # Prefer the candidate that is most frequent in the Brown corpus.
                best_candidate = max(candidates,
                                     key=lambda x: global_freq_dist[x.lower()])
                new_tokens.append(best_candidate)
            else:
                new_tokens.append(token)
        else:
            new_tokens.append(token)
    return new_tokens

def restructure_sentences(text):
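    """Break sentences longer than 25 words after their first comma or semicolon."""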
    sentences = sent_tokenize(text)
    restructured = []
    for sent in sentences:
        if len(sent.split()) > 25:
            parts = re.split(r'[,;]', sent)
            if len(parts) > 1:
                sent = parts[0] + '. ' + ' '.join(parts[1:])
        restructured.append(sent)
    return ' '.join(restructured)

def contextual_spell_check(text):
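    """Spell-correct tokens, leaving named entities and non-alphabetic tokens untouched."""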
    protected = protect_proper_nouns(text, extract_proper_nouns(text))
    spell = SpellChecker()
    corrected = []
    for token, protected_flag in protected:
        if protected_flag or not token.isalpha():
            corrected.append(token)
            continue
        correction = spell.correction(token)
        if correction:
            if token[0].isupper():
                corrected.append(correction.capitalize())
            else:
                corrected.append(correction.lower())
        else:
            corrected.append(token)
    return detokenizer.detokenize(corrected)

def finalize_formatting(text):
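    """Tidy spacing around punctuation and parentheses, curl straight quotes, and convert double hyphens to em dashes."""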
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)
    text = re.sub(r'([(])\s+', r'\1', text)
    text = re.sub(r'\s+([)])', r'\1', text)
    text = re.sub(r'\"(.*?)\"', r'“\1”', text)
    text = re.sub(r' -- ', r' — ', text)
    return text.strip()

def humanize_text(text):
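    """Replace words with common synonyms while protecting named entities, then restructure long sentences."""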
    dynamic_entities = extract_proper_nouns(text)
    protected_tokens = protect_proper_nouns(text, dynamic_entities)
    tokens = [t[0] for t in protected_tokens]
    tags = pos_tag(tokens)
    new_tokens = replace_words_with_synonyms(protected_tokens, tags)
    text = detokenizer.detokenize(new_tokens)
    return restructure_sentences(text)

def process_text(input_text):
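    """Pipeline entry point: humanize, spell-check, then clean up formatting."""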
    if input_text:
        humanized = humanize_text(input_text)
        spell_checked = contextual_spell_check(humanized)
        final_output = finalize_formatting(spell_checked)
        return final_output
    return ""

# Gradio Interface
iface = gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=5, placeholder="Enter text here..."),
    outputs=gr.Textbox(lines=5),
    title="AI to Humanized Text Converter",
    description="Enter text and get a more humanized, readable output.",
)
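
# For quick manual testing without the UI, process_text() can also be called
# directly, e.g. print(process_text("Teh report was verry well recieved.")).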

if __name__ == "__main__":
    iface.launch()