MUSKAN17 committed on
Commit 33a85b2 · verified · 1 Parent(s): 3e2d36c
Files changed (1)
  1. app.py +151 -151
app.py CHANGED
@@ -1,165 +1,165 @@
- import gradio as gr
- from nltk import pos_tag, ne_chunk
- from nltk.tokenize import word_tokenize, sent_tokenize
- from nltk.tokenize.treebank import TreebankWordDetokenizer
- from nltk.corpus import wordnet, brown
- from spellchecker import SpellChecker
- import re
- import nltk

- # Preload resources
- detokenizer = TreebankWordDetokenizer()
- global_freq_dist = nltk.FreqDist(w.lower() for w in brown.words())

- def extract_proper_nouns(text):
-     """
-     Extracts proper nouns such as PERSON, ORGANIZATION, and GPE from the given text.
-     Returns a set of detected proper nouns in lowercase.
-     """
-     tokens = word_tokenize(text)
-     tagged = pos_tag(tokens)
-     chunks = ne_chunk(tagged)
-     proper_nouns = set()

-     for chunk in chunks:
-         if hasattr(chunk, 'label') and chunk.label() in ('PERSON', 'ORGANIZATION', 'GPE'):
-             entity = " ".join(c[0] for c in chunk)
-             proper_nouns.add(entity.lower())
-     return proper_nouns

- def protect_proper_nouns(text, dynamic_entities):
-     """
-     Identifies and marks proper nouns to prevent them from being altered.
-     Returns a list of tuples (word, is_protected).
-     """
-     tokens = word_tokenize(text)
-     protected = []
-     for token in tokens:
-         lower_token = token.lower()
-         if any(lower_token in entity for entity in dynamic_entities):
-             protected.append((token, True))
-         else:
-             protected.append((token, False))
-     return protected

- def replace_words_with_synonyms(tokens_with_protection, pos_tags):
-     """
-     Replaces words with synonyms while maintaining readability and ensuring proper nouns remain unchanged.
-     """
-     new_tokens = []
-     for (token, protected), (_, tag) in zip(tokens_with_protection, pos_tags):
-         if protected or not token.isalpha():
-             new_tokens.append(token)
-             continue

-         pos = None
-         if tag.startswith('JJ'):
-             pos = wordnet.ADJ
-         elif tag.startswith('RB'):
-             pos = wordnet.ADV
-         elif tag.startswith('NN'):
-             pos = wordnet.NOUN
-         elif tag.startswith('VB'):
-             pos = wordnet.VERB

-         if pos:
-             candidates = []
-             for syn in wordnet.synsets(token, pos=pos):
-                 for lemma in syn.lemmas():
-                     candidate = lemma.name().replace('_', ' ')
-                     if candidate.lower() == token.lower() or ' ' in candidate:
-                         continue
-                     if len(candidate) > len(token) * 1.2:
-                         continue
-                     candidates.append(candidate)
-             if candidates:
-                 best_candidate = max(candidates,
-                                      key=lambda x: global_freq_dist[x.lower()],
-                                      default=token)
-                 new_tokens.append(best_candidate)
-             else:
-                 new_tokens.append(token)
-         else:
-             new_tokens.append(token)
-     return new_tokens

- def restructure_sentences(text):
-     """
-     Splits long sentences for better readability.
-     Uses punctuation-based splitting if necessary.
-     """
-     sentences = sent_tokenize(text)
-     restructured = []
-     for sent in sentences:
-         if len(sent.split()) > 25:
-             parts = re.split(r'[,;]', sent)
-             if len(parts) > 1:
-                 sent = parts[0] + '. ' + ' '.join(parts[1:])
-         restructured.append(sent)
-     return ' '.join(restructured)

- def contextual_spell_check(text):
-     """
-     Performs spell checking while ensuring proper nouns remain unchanged.
-     """
-     protected = protect_proper_nouns(text, extract_proper_nouns(text))
-     spell = SpellChecker()
-     corrected = []
-     for token, protected_flag in protected:
-         if protected_flag or not token.isalpha():
-             corrected.append(token)
-             continue
-         correction = spell.correction(token)
-         if correction:
-             if token[0].isupper():
-                 corrected.append(correction.capitalize())
-             else:
-                 corrected.append(correction.lower())
-         else:
-             corrected.append(token)
-     return detokenizer.detokenize(corrected)

- def finalize_formatting(text):
-     """
-     Cleans up text formatting including spaces before punctuation, quote styles, and em dashes.
-     """
-     text = re.sub(r'\s+([.,!?;:])', r'\1', text)
-     text = re.sub(r'([(])\s+', r'\1', text)
-     text = re.sub(r'\s+([)])', r'\1', text)
-     text = re.sub(r'\"(.*?)\"', r'“\1”', text)
-     text = re.sub(r' -- ', r' — ', text)
-     return text.strip()

- def humanize_text(text):
-     """
-     Applies synonym replacement, sentence restructuring, and other enhancements to make text more natural.
-     """
-     dynamic_entities = extract_proper_nouns(text)
-     protected_tokens = protect_proper_nouns(text, dynamic_entities)
-     tokens = [t[0] for t in protected_tokens]
-     tags = pos_tag(tokens)
-     new_tokens = replace_words_with_synonyms(protected_tokens, tags)
-     text = detokenizer.detokenize(new_tokens)
-     return restructure_sentences(text)

- def process_text(input_text):
-     """
-     Processes input text to enhance readability by applying all transformations sequentially.
-     """
-     if input_text:
-         humanized = humanize_text(input_text)
-         spell_checked = contextual_spell_check(humanized)
-         final_output = finalize_formatting(spell_checked)
-         return final_output
-     return ""

- # Gradio Interface
- iface = gr.Interface(
-     fn=process_text,
-     inputs=gr.Textbox(lines=5, placeholder="Enter text here..."),
-     outputs=gr.Textbox(lines=5),
-     title="AI to Humanized Text Converter",
-     description="Enter text and get a more humanized, readable output.",
- )

- iface.launch()
 
+ # import gradio as gr
+ # from nltk import pos_tag, ne_chunk
+ # from nltk.tokenize import word_tokenize, sent_tokenize
+ # from nltk.tokenize.treebank import TreebankWordDetokenizer
+ # from nltk.corpus import wordnet, brown
+ # from spellchecker import SpellChecker
+ # import re
+ # import nltk

+ # # Preload resources
+ # detokenizer = TreebankWordDetokenizer()
+ # global_freq_dist = nltk.FreqDist(w.lower() for w in brown.words())

+ # def extract_proper_nouns(text):
+ #     """
+ #     Extracts proper nouns such as PERSON, ORGANIZATION, and GPE from the given text.
+ #     Returns a set of detected proper nouns in lowercase.
+ #     """
+ #     tokens = word_tokenize(text)
+ #     tagged = pos_tag(tokens)
+ #     chunks = ne_chunk(tagged)
+ #     proper_nouns = set()

+ #     for chunk in chunks:
+ #         if hasattr(chunk, 'label') and chunk.label() in ('PERSON', 'ORGANIZATION', 'GPE'):
+ #             entity = " ".join(c[0] for c in chunk)
+ #             proper_nouns.add(entity.lower())
+ #     return proper_nouns

+ # def protect_proper_nouns(text, dynamic_entities):
+ #     """
+ #     Identifies and marks proper nouns to prevent them from being altered.
+ #     Returns a list of tuples (word, is_protected).
+ #     """
+ #     tokens = word_tokenize(text)
+ #     protected = []
+ #     for token in tokens:
+ #         lower_token = token.lower()
+ #         if any(lower_token in entity for entity in dynamic_entities):
+ #             protected.append((token, True))
+ #         else:
+ #             protected.append((token, False))
+ #     return protected

+ # def replace_words_with_synonyms(tokens_with_protection, pos_tags):
+ #     """
+ #     Replaces words with synonyms while maintaining readability and ensuring proper nouns remain unchanged.
+ #     """
+ #     new_tokens = []
+ #     for (token, protected), (_, tag) in zip(tokens_with_protection, pos_tags):
+ #         if protected or not token.isalpha():
+ #             new_tokens.append(token)
+ #             continue

+ #         pos = None
+ #         if tag.startswith('JJ'):
+ #             pos = wordnet.ADJ
+ #         elif tag.startswith('RB'):
+ #             pos = wordnet.ADV
+ #         elif tag.startswith('NN'):
+ #             pos = wordnet.NOUN
+ #         elif tag.startswith('VB'):
+ #             pos = wordnet.VERB

+ #         if pos:
+ #             candidates = []
+ #             for syn in wordnet.synsets(token, pos=pos):
+ #                 for lemma in syn.lemmas():
+ #                     candidate = lemma.name().replace('_', ' ')
+ #                     if candidate.lower() == token.lower() or ' ' in candidate:
+ #                         continue
+ #                     if len(candidate) > len(token) * 1.2:
+ #                         continue
+ #                     candidates.append(candidate)
+ #             if candidates:
+ #                 best_candidate = max(candidates,
+ #                                      key=lambda x: global_freq_dist[x.lower()],
+ #                                      default=token)
+ #                 new_tokens.append(best_candidate)
+ #             else:
+ #                 new_tokens.append(token)
+ #         else:
+ #             new_tokens.append(token)
+ #     return new_tokens

+ # def restructure_sentences(text):
+ #     """
+ #     Splits long sentences for better readability.
+ #     Uses punctuation-based splitting if necessary.
+ #     """
+ #     sentences = sent_tokenize(text)
+ #     restructured = []
+ #     for sent in sentences:
+ #         if len(sent.split()) > 25:
+ #             parts = re.split(r'[,;]', sent)
+ #             if len(parts) > 1:
+ #                 sent = parts[0] + '. ' + ' '.join(parts[1:])
+ #         restructured.append(sent)
+ #     return ' '.join(restructured)

+ # def contextual_spell_check(text):
+ #     """
+ #     Performs spell checking while ensuring proper nouns remain unchanged.
+ #     """
+ #     protected = protect_proper_nouns(text, extract_proper_nouns(text))
+ #     spell = SpellChecker()
+ #     corrected = []
+ #     for token, protected_flag in protected:
+ #         if protected_flag or not token.isalpha():
+ #             corrected.append(token)
+ #             continue
+ #         correction = spell.correction(token)
+ #         if correction:
+ #             if token[0].isupper():
+ #                 corrected.append(correction.capitalize())
+ #             else:
+ #                 corrected.append(correction.lower())
+ #         else:
+ #             corrected.append(token)
+ #     return detokenizer.detokenize(corrected)

+ # def finalize_formatting(text):
+ #     """
+ #     Cleans up text formatting including spaces before punctuation, quote styles, and em dashes.
+ #     """
+ #     text = re.sub(r'\s+([.,!?;:])', r'\1', text)
+ #     text = re.sub(r'([(])\s+', r'\1', text)
+ #     text = re.sub(r'\s+([)])', r'\1', text)
+ #     text = re.sub(r'\"(.*?)\"', r'“\1”', text)
+ #     text = re.sub(r' -- ', r' — ', text)
+ #     return text.strip()

+ # def humanize_text(text):
+ #     """
+ #     Applies synonym replacement, sentence restructuring, and other enhancements to make text more natural.
+ #     """
+ #     dynamic_entities = extract_proper_nouns(text)
+ #     protected_tokens = protect_proper_nouns(text, dynamic_entities)
+ #     tokens = [t[0] for t in protected_tokens]
+ #     tags = pos_tag(tokens)
+ #     new_tokens = replace_words_with_synonyms(protected_tokens, tags)
+ #     text = detokenizer.detokenize(new_tokens)
+ #     return restructure_sentences(text)

+ # def process_text(input_text):
+ #     """
+ #     Processes input text to enhance readability by applying all transformations sequentially.
+ #     """
+ #     if input_text:
+ #         humanized = humanize_text(input_text)
+ #         spell_checked = contextual_spell_check(humanized)
+ #         final_output = finalize_formatting(spell_checked)
+ #         return final_output
+ #     return ""

+ # # Gradio Interface
+ # iface = gr.Interface(
+ #     fn=process_text,
+ #     inputs=gr.Textbox(lines=5, placeholder="Enter text here..."),
+ #     outputs=gr.Textbox(lines=5),
+ #     title="AI to Humanized Text Converter",
+ #     description="Enter text and get a more humanized, readable output.",
+ # )

+ # iface.launch()
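
Note for anyone re-enabling the code removed above: the app depends on several downloadable NLTK resources, including the brown corpus that is read at import time when building global_freq_dist. Below is a minimal setup sketch, not part of the commit; the resource identifiers are the standard nltk.download names, but exact requirements may vary with the NLTK version in use.

```python
# Setup sketch (assumption, not from this commit): fetch the NLTK data the app uses.
import nltk

for resource in [
    "punkt",                       # word_tokenize / sent_tokenize
    "averaged_perceptron_tagger",  # pos_tag
    "maxent_ne_chunker",           # ne_chunk
    "words",                       # word list used by ne_chunk
    "wordnet",                     # synonym lookup
    "brown",                       # corpus read at import time for the frequency distribution
]:
    nltk.download(resource)
```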