Add files via upload

sharadhirao · Jul 2, 2020 · e10a153 · e10a153
1 parent ddbbed6
commit e10a153
Show file tree

Hide file tree

Showing 3 changed files with 301 additions and 0 deletions.
diff --git a/imdb_sentiment_analysis.py b/imdb_sentiment_analysis.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[4]:
+
+
+import pandas as pd 
+# Local directory
+Reviewdata = pd.read_csv('train_data.csv')
+#data taken from kaggle
+Reviewdata.columns
+
+
+# In[5]:
+
+
+### Checking for the Distribution of Default ###
+import matplotlib.pyplot as plt
+get_ipython().run_line_magic('matplotlib', 'inline')
+print('Percentage for default\n')
+print(round(Reviewdata.type.value_counts(normalize=True)*100,2))
+round(Reviewdata.type.value_counts(normalize=True)*100,2).plot(kind='bar')
+plt.title('Percentage Distributions by review type')
+plt.show()
+
+
+# In[6]:
+
+
+# Apply first level cleaning
+import re
+import string
+
+#This function converts to lower-case, removes square bracket, removes numbers and punctuation
+def text_clean_1(text):
+    text = text.lower()
+    text = re.sub('\[.*?\]', '', text)
+    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
+    text = re.sub('\w*\d\w*', '', text)
+    return text
+cleaned1 = lambda x: text_clean_1(x)
+
+
+# In[8]:
+
+
+# Let's take a look at the updated text
+Reviewdata['cleaned_description'] = pd.DataFrame(Reviewdata.review.apply(cleaned1))
+Reviewdata.head(10)
+
+
+# In[9]:
+
+
+# Apply a second round of cleaning
+def text_clean_2(text):
+    text = re.sub('[‘’“”…]', '', text)
+    text = re.sub('\n', '', text)
+    return text
+
+cleaned2 = lambda x: text_clean_2(x)
+
+# Let's take a look at the updated text
+Reviewdata['cleaned_description_new'] = pd.DataFrame(Reviewdata['cleaned_description'].apply(cleaned2))
+Reviewdata.head(10)
+
+
+# In[12]:
+
+
+Reviewdata.drop(columns = ['review'], inplace = True)
+Reviewdata.head(4)
+
+
+# In[14]:
+
+
+from sklearn.model_selection import train_test_split
+Independent_var = Reviewdata.cleaned_description_new
+Dependent_var = Reviewdata.type
+IV_train, IV_test, DV_train, DV_test = train_test_split(Independent_var, Dependent_var, test_size = 0.2, random_state = 225)
+print('IV_train :', len(IV_train))
+print('IV_test  :', len(IV_test))
+print('DV_train :', len(DV_train))
+print('DV_test  :', len(DV_test))
+
+
+# In[15]:
+
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+
+tvec = TfidfVectorizer()
+
+clf2 = LogisticRegression(max_iter=1000) 
+
+
+# In[16]:
+
+
+from sklearn.pipeline import Pipeline
+
+
+model = Pipeline([('vectorizer',tvec),('classifier',clf2)])
+
+model.fit(IV_train, DV_train)
+
+
+# In[17]:
+
+
+from sklearn.metrics import confusion_matrix
+
+predictions = model.predict(IV_test)
+
+confusion_matrix(predictions, DV_test)
+
+
+# In[18]:
+
+
+from sklearn.metrics import accuracy_score, precision_score, recall_score
+
+print("Accuracy : ", accuracy_score(predictions, DV_test))
+print("Precision : ", precision_score(predictions, DV_test, average = 'weighted'))
+print("Recall : ", recall_score(predictions, DV_test, average = 'weighted'))
+
+
+# In[31]:
+
+
+ex=[input(("enter a string: "))]
+n=model.predict(ex)
+if(n==0):
+    print("negative")
+elif (n==1):
+    print("positive")
+
+
+# In[30]:
+
+
+ex=[input(("enter a string: "))]
+n=model.predict(ex)
+if(n==0):
+    print("negative")
+elif (n==1):
+    print("positive")
+
+
+# In[ ]:
+
+
+
+
diff --git a/sentiment_analysis_using_random_forest.py b/sentiment_analysis_using_random_forest.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[37]:
+
+
+import pandas as pd 
+# Local directory
+Reviewdata = pd.read_csv('train_data.csv')
+#data taken from kaggle
+
+
+# In[38]:
+
+
+# Apply first level cleaning
+import re
+import string
+
+#This function converts to lower-case, removes square bracket, removes numbers and punctuation
+def text_clean_1(text):
+    text = text.lower()
+    text = re.sub('\[.*?\]', '', text)
+    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
+    text = re.sub('\w*\d\w*', '', text)
+    return text
+cleaned1 = lambda x: text_clean_1(x)
+
+
+# In[39]:
+
+
+Reviewdata.columns
+
+
+# In[40]:
+
+
+Reviewdata['cleaned_description'] = pd.DataFrame(Reviewdata.review.apply(cleaned1))
+Reviewdata.head(5)
+
+
+# In[41]:
+
+
+# Apply a second round of cleaning
+def text_clean_2(text):
+    text = re.sub('[‘’“”…]', '', text)
+    text = re.sub('\n', '', text)
+    return text
+
+cleaned2 = lambda x: text_clean_2(x)
+
+# Let's take a look at the updated text
+Reviewdata['cleaned_description_new'] = pd.DataFrame(Reviewdata['cleaned_description'].apply(cleaned2))
+Reviewdata.head(5)
+
+
+# In[42]:
+
+
+#remove unnecessary columns
+Reviewdata.drop(columns = ['review','cleaned_description'], inplace = True)
+Reviewdata.head(4)
+
+
+# In[43]:
+
+
+from sklearn.model_selection import train_test_split
+Independent_var = Reviewdata.cleaned_description_new
+Dependent_var = Reviewdata.type
+IV_train, IV_test, DV_train, DV_test = train_test_split(Independent_var, Dependent_var, test_size = 0.2, random_state = 225)
+
+
+# In[44]:
+
+
+#vectorizeing
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.ensemble import RandomForestClassifier
+
+tvec = TfidfVectorizer()
+
+clf2 = RandomForestClassifier()
+
+
+# In[45]:
+
+
+#using pipeline pass data to ran
+from sklearn.pipeline import Pipeline
+
+model = Pipeline([('vectorizer',tvec),('classifier',clf2)])
+
+model.fit(IV_train, DV_train)
+
+
+# In[46]:
+
+
+from sklearn.metrics import confusion_matrix
+
+predictions = model.predict(IV_test)
+
+confusion_matrix(predictions, DV_test)
+
+
+# In[47]:
+
+
+from sklearn.metrics import accuracy_score, precision_score, recall_score
+
+print("Accuracy : ", accuracy_score(predictions, DV_test))
+print("Precision : ", precision_score(predictions, DV_test, average = 'weighted'))
+print("Recall : ", recall_score(predictions, DV_test, average = 'weighted'))
+
+
+# In[51]:
+
+
+ex=[input(("enter a string: "))]
+data=model.predict(ex)
+if(data==0):
+    print("negative review")
+elif data==1:
+    print("positive review")
+
+
+# In[52]:
+
+
+ex=[input(("enter a string: "))]
+data=model.predict(ex)
+if(data==0):
+    print("negative review")
+elif data==1:
+    print("positive review")
+
+
+# In[ ]:
+
+
+
+
diff --git a/train_data.rar b/train_data.rar