Spaces:

neel692
/

Abusive-Comment-Detection

Sleeping

App Files Files Community

NeelTA committed on May 20, 2023

Commit

f3d8098

1 Parent(s): 0c01d2e

new repo

Browse files

Files changed (8) hide show

__pycache__/clean.cpython-39.pyc +0 -0
__pycache__/language_detection.cpython-39.pyc +0 -0
app.py +52 -0
clean.py +23 -0
language_detection.py +246 -0
model_joblib.pkl +3 -0
requirements.txt +4 -0
tf_joblib.pkl +3 -0

__pycache__/clean.cpython-39.pyc ADDED Viewed

Binary file (1.12 kB). View file

__pycache__/language_detection.cpython-39.pyc ADDED Viewed

Binary file (2.3 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import gradio as gr
+from gradio.components import Text
+import joblib
+import clean
+import numpy as np
+import language_detection
+# Startup progress marker: reached only after every import above succeeded.
+print("all imports worked")
+# Load the pre-trained classifier; predict_abusive_lang() calls model.predict() on it.
+model = joblib.load('model_joblib.pkl')
+print("model load ")
+# Load the fitted vectorizer; predict_abusive_lang() calls tf.transform() on it.
+# NOTE(review): presumably a scikit-learn TF-IDF vectorizer, judging by the log text -- confirm.
+tf = joblib.load('tf_joblib.pkl')
+print("tfidf load ")
+# Define function to predict whether sentence is abusive or not
+def predict_abusive_lang(text):
+    """Classify a comment as abusive or not abusive.
+
+    The comment is first routed by language. English text is cleaned,
+    vectorized with the module-level ``tf`` object and scored with the
+    module-level ``model``. Hindi text is recognised but not classified yet.
+
+    Args:
+        text: Raw comment typed into the input box.
+
+    Returns:
+        A two-item list ``[result_message, cleaned_text]``. The second item is
+        the literal "No cleaned text" whenever no classification was made.
+    """
+    no_input = ["Please write something in the comment box..", "No cleaned text"]
+    print("original text ", text)
+    # Blank input has no words to analyse, so answer directly instead of
+    # handing it to language detection.
+    if not text or not text.strip():
+        return no_input
+    lang = language_detection.en_hi_detection(text)
+    print("language detected ", lang)
+    if lang == 'hi':
+        print("using hugging face api")
+        return ["Hindi Text abusive part coming soon.....", "No cleaned text"]
+    if lang != 'eng':
+        return ["Unknown language", "No cleaned text"]
+    cleaned_text = clean.text_cleaning(text)
+    # Log the cleaned string itself (the raw input was already logged above).
+    print("cleaned text ", cleaned_text)
+    # Cleaning can remove everything (e.g. a lone mention); there is nothing
+    # meaningful to score in that case.
+    if not cleaned_text:
+        return no_input
+    # Keep the feature matrix in its own name so `text` is never shadowed.
+    features = tf.transform([cleaned_text])
+    print("tfidf transformation ", features)
+    prediction = model.predict(features)
+    print("prediction ", prediction)
+    if len(prediction) != 0 and prediction[0] == 0:
+        return ["Not Abusive", cleaned_text]
+    if len(prediction) != 0 and prediction[0] == 1:
+        return ["Abusive", cleaned_text]
+    # Fallback for an empty or unexpected prediction.
+    return no_input
+# text = '":::::: 128514 - & % ! @ # $ % ^ & * ( ) _ + I got blocked for 30 minutes, you got blocked for more than days. You is lost.  www.google.com, #happydiwali, @amangupta And I don\'t even know who the fuck are you.  It\'s a zero! \n"'
+# predict_abusive_lang(text)
+# Define the GRADIO output components: one box for the verdict, one for the
+# cleaned text returned by predict_abusive_lang().
+# gr.Textbox is used instead of gr.outputs.Textbox because the gr.outputs
+# namespace is deprecated in Gradio 3 and no longer exists in Gradio 4.
+output_interfaces = [
+    gr.Textbox(label="Result"),
+    gr.Textbox(label="Cleaned text"),
+]
+app = gr.Interface(
+    predict_abusive_lang,
+    inputs='text',
+    outputs=output_interfaces,
+    title="Abuse Classifier",
+    description="Enter a sentence and the model will predict whether it is abusive or not.",
+)
+#Start the GRADIO app
+app.launch()

clean.py ADDED Viewed

	@@ -0,0 +1,23 @@

+from string import punctuation
+import re
+def text_cleaning(text):
+    """Normalise a raw comment into a lower-case, space-separated string.
+
+    Steps, applied in order:
+      1. delete every ``http...`` / ``www...`` run and every double quote;
+      2. drop tokens that start with ``@`` or ``#`` (mentions, hashtags);
+      3. drop tokens that are themselves found inside the punctuation string;
+      4. trim punctuation from both ends of each remaining token;
+      5. drop tokens left empty or whose first character is a digit;
+      6. lower-case what is left and join it with single spaces.
+    """
+    without_links = re.sub(r'http\S+|www\S+|\"', '', text)
+    kept = []
+    for token in without_links.split():
+        # Mentions and hashtags carry no signal for the classifier.
+        if token.startswith(('@', '#')):
+            continue
+        # Substring test against the punctuation string, exactly as before.
+        if token in punctuation:
+            continue
+        core = token.strip(punctuation)
+        # Skip tokens that were nothing but punctuation, and numeric-leading ones.
+        if core == '' or core[0].isdigit():
+            continue
+        kept.append(core.lower())
+    return " ".join(kept)

language_detection.py ADDED Viewed

	@@ -0,0 +1,246 @@

+import nltk
+from nltk.corpus import wordnet
+import re
+from nltk.stem import WordNetLemmatizer
+# English function words that en_hi_detection() counts as English even when
+# the WordNet lookup does not recognise them (stop_words is checked after it).
+# NOTE(review): looks like NLTK's English stop-word list pasted inline -- confirm.
+stop_words = ['i',
+ 'me',
+ 'my',
+ 'myself',
+ 'we',
+ 'our',
+ 'ours',
+ 'ourselves',
+ 'you',
+ "you're",
+ "you've",
+ "you'll",
+ "you'd",
+ 'your',
+ 'yours',
+ 'yourself',
+ 'yourselves',
+ 'he',
+ 'him',
+ 'his',
+ 'himself',
+ 'she',
+ "she's",
+ 'her',
+ 'hers',
+ 'herself',
+ 'it',
+ "it's",
+ 'its',
+ 'itself',
+ 'they',
+ 'them',
+ 'their',
+ 'theirs',
+ 'themselves',
+ 'what',
+ 'which',
+ 'who',
+ 'whom',
+ 'this',
+ 'that',
+ "that'll",
+ 'these',
+ 'those',
+ 'am',
+ 'is',
+ 'are',
+ 'was',
+ 'were',
+ 'be',
+ 'been',
+ 'being',
+ 'have',
+ 'has',
+ 'had',
+ 'having',
+ 'do',
+ 'does',
+ 'did',
+ 'doing',
+ 'a',
+ 'an',
+ 'the',
+ 'and',
+ 'but',
+ 'if',
+ 'or',
+ 'because',
+ 'as',
+ 'until',
+ 'while',
+ 'of',
+ 'at',
+ 'by',
+ 'for',
+ 'with',
+ 'about',
+ 'against',
+ 'between',
+ 'into',
+ 'through',
+ 'during',
+ 'before',
+ 'after',
+ 'above',
+ 'below',
+ 'to',
+ 'from',
+ 'up',
+ 'down',
+ 'in',
+ 'out',
+ 'on',
+ 'off',
+ 'over',
+ 'under',
+ 'again',
+ 'further',
+ 'then',
+ 'once',
+ 'here',
+ 'there',
+ 'when',
+ 'where',
+ 'why',
+ 'how',
+ 'all',
+ 'any',
+ 'both',
+ 'each',
+ 'few',
+ 'more',
+ 'most',
+ 'other',
+ 'some',
+ 'such',
+ 'no',
+ 'nor',
+ 'not',
+ 'only',
+ 'own',
+ 'same',
+ 'so',
+ 'than',
+ 'too',
+ 'very',
+ 's',
+ 't',
+ 'can',
+ 'will',
+ 'just',
+ 'don',
+ "don't",
+ 'should',
+ "should've",
+ 'now',
+ 'd',
+ 'll',
+ 'm',
+ 'o',
+ 're',
+ 've',
+ 'y',
+ 'ain',
+ 'aren',
+ "aren't",
+ 'couldn',
+ "couldn't",
+ 'didn',
+ "didn't",
+ 'doesn',
+ "doesn't",
+ 'hadn',
+ "hadn't",
+ 'hasn',
+ "hasn't",
+ 'haven',
+ "haven't",
+ 'isn',
+ "isn't",
+ 'ma',
+ 'mightn',
+ "mightn't",
+ 'mustn',
+ "mustn't",
+ 'needn',
+ "needn't",
+ 'shan',
+ "shan't",
+ 'shouldn',
+ "shouldn't",
+ 'wasn',
+ "wasn't",
+ 'weren',
+ "weren't",
+ 'won',
+ "won't",
+ 'wouldn',
+ "wouldn't"]
+# Shared WordNet lemmatizer; en_hi_detection() uses it to reduce each word to
+# its noun, verb, adjective and adverb base forms before the dictionary lookup.
+lemmatizer = WordNetLemmatizer()
+# Disabled: load a lower-cased word set from the english_words package.
+#from english_words import get_english_words_set
+#web2lowerset = get_english_words_set(['web2'], lower=True)
+# Inclusive code-point bounds (first, last) treated as Hindi script.
+HINDI_UNICODE_RANGE = (0x0900, 0x097F)
+def is_hindi_letter(c):
+    """Return True when the single character c lies inside HINDI_UNICODE_RANGE."""
+    first, last = HINDI_UNICODE_RANGE
+    return first <= ord(c) <= last
+# In[8]:
+# Set of every WordNet lemma name, built on first use by _english_vocabulary().
+_WORDNET_VOCABULARY = None
+def _english_vocabulary():
+    """Return all WordNet lemma names as a set, building it once per process."""
+    global _WORDNET_VOCABULARY
+    if _WORDNET_VOCABULARY is None:
+        # wordnet.words() is a one-shot iterator over the whole lexicon;
+        # materialising it once turns every later lookup into a set probe
+        # instead of a full rescan.
+        _WORDNET_VOCABULARY = set(wordnet.words())
+    return _WORDNET_VOCABULARY
+def _is_english_word(word, vocabulary):
+    """Return True if any lemmatized form of word is a WordNet lemma or a stop word."""
+    for pos in (wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV):
+        lem_word = lemmatizer.lemmatize(word, pos)
+        if lem_word in vocabulary or lem_word in stop_words:
+            return True
+    return False
+def en_hi_detection(text):
+    """Guess whether text is English or Hindi.
+
+    Punctuation is replaced by spaces and the text is lower-cased and split
+    into words. A word counts as English when one of its lemmatized forms is
+    a WordNet lemma or a stop word, and as Hindi when it contains at least one
+    character accepted by is_hindi_letter().
+
+    Args:
+        text: Raw input string.
+
+    Returns:
+        "eng" when more than 75% of the words are English, otherwise "hi"
+        when more than 75% contain Hindi letters, otherwise "unknown".
+        Input with no words at all (empty or punctuation only) is "unknown".
+    """
+    text = re.sub(r'[^\w\s]', ' ', text)
+    words = text.lower().strip().split()
+    # Nothing to measure: avoid dividing by a zero word count below.
+    if not words:
+        return "unknown"
+    vocabulary = _english_vocabulary()
+    count_en = sum(1 for word in words if _is_english_word(word, vocabulary))
+    count_hi = sum(1 for word in words if any(is_hindi_letter(c) for c in word))
+    if count_en / len(words) * 100 > 75:
+        return "eng"
+    if count_hi / len(words) * 100 > 75:
+        return "hi"
+    return "unknown"

model_joblib.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6308a9d0d4eb28b3ea67bc20a2e200218a9ca2c12b2fc8e17027536d1147d20f
+size 318919

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+scikit-learn==1.0.2
+nltk==3.8.1
+joblib==1.0.1

tf_joblib.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e53104db442b78f814eab3c2d081f6fc06279a4bdec6cfaea81c8221447f5dd3
+size 1441403