diff --git a/imdb_sentiment_analysis.py b/imdb_sentiment_analysis.py
new file mode 100644
index 0000000..5c50661
--- /dev/null
+++ b/imdb_sentiment_analysis.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[4]:
+
+
+import pandas as pd
+# Local directory
+Reviewdata = pd.read_csv('train_data.csv')
+# Data taken from Kaggle
+Reviewdata.columns
+
+
+# In[5]:
+
+
+### Checking the distribution of review types ###
+import matplotlib.pyplot as plt
+try:
+    # get_ipython only exists under IPython/Jupyter; skip the inline backend in a plain script run.
+    get_ipython().run_line_magic('matplotlib', 'inline')
+except NameError:
+    pass
+print('Percentage for default\n')
+type_percentages = round(Reviewdata.type.value_counts(normalize=True)*100,2)
+print(type_percentages)
+type_percentages.plot(kind='bar')
+plt.title('Percentage Distributions by review type')
+plt.show()
+
+
+# In[6]:
+
+
+# Apply first level cleaning
+import re
+import string
+
+def text_clean_1(text):
+    """Lower-case the text, then strip [bracketed] spans, punctuation and words containing a digit."""
+    text = text.lower()
+    text = re.sub(r'\[.*?\]', '', text)
+    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
+    text = re.sub(r'\w*\d\w*', '', text)
+    return text
+
+# Alias kept so code that refers to cleaned1 keeps working.
+cleaned1 = text_clean_1
+
+
+# In[8]:
+
+
+# Let's take a look at the updated text
+Reviewdata['cleaned_description'] = Reviewdata.review.apply(cleaned1)
+Reviewdata.head(10)
+
+
+# In[9]:
+
+
+# Apply a second round of cleaning
+def text_clean_2(text):
+    """Remove curly quotes, ellipsis characters and newline characters from the text."""
+    text = re.sub('[‘’“”…]', '', text)
+    text = re.sub('\n', '', text)
+    return text
+
+cleaned2 = text_clean_2
+
+# Let's take a look at the updated text
+Reviewdata['cleaned_description_new'] = Reviewdata['cleaned_description'].apply(cleaned2)
+Reviewdata.head(10)
+
+
+# In[12]:
+
+
+Reviewdata.drop(columns = ['review'], inplace = True)
+Reviewdata.head(4)
+
+
+# In[14]:
+
+
+from sklearn.model_selection import train_test_split
+Independent_var = Reviewdata.cleaned_description_new
+Dependent_var = Reviewdata.type
+IV_train, IV_test, DV_train, DV_test = train_test_split(Independent_var, Dependent_var, test_size = 0.2, random_state = 225)
+print('IV_train :', len(IV_train))
+print('IV_test  :', len(IV_test))
+print('DV_train :', len(DV_train))
+print('DV_test  :', len(DV_test))
+
+
+# In[15]:
+
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+
+tvec = TfidfVectorizer()
+
+clf2 = LogisticRegression(max_iter=1000)
+
+
+# In[16]:
+
+
+from sklearn.pipeline import Pipeline
+
+
+model = Pipeline([('vectorizer',tvec),('classifier',clf2)])
+
+model.fit(IV_train, DV_train)
+
+
+# In[17]:
+
+
+from sklearn.metrics import confusion_matrix
+
+predictions = model.predict(IV_test)
+
+# scikit-learn metrics take (y_true, y_pred): rows are true labels, columns are predicted labels.
+confusion_matrix(DV_test, predictions)
+
+
+# In[18]:
+
+
+from sklearn.metrics import accuracy_score, precision_score, recall_score
+
+print("Accuracy : ", accuracy_score(DV_test, predictions))
+print("Precision : ", precision_score(DV_test, predictions, average = 'weighted'))
+print("Recall : ", recall_score(DV_test, predictions, average = 'weighted'))
+
+
+# In[31]:
+
+
+# Clean the typed review with the same two passes used on the training text before predicting.
+# NOTE(review): the 0/1 checks assume `type` is encoded 0 = negative, 1 = positive - confirm against train_data.csv.
+ex=[text_clean_2(text_clean_1(input(("enter a string: "))))]
+n=model.predict(ex)
+if(n==0):
+    print("negative")
+elif (n==1):
+    print("positive")
+
+
+# In[30]:
+
+
+# Second interactive prediction, cleaned the same way as the training text.
+ex=[text_clean_2(text_clean_1(input(("enter a string: "))))]
+n=model.predict(ex)
+if(n==0):
+    print("negative")
+elif (n==1):
+    print("positive")
+
+
+# In[ ]:
+
+
+
+
diff --git a/sentiment_analysis_using_random_forest.py b/sentiment_analysis_using_random_forest.py
new file mode 100644
index 0000000..85727fe
--- /dev/null
+++ b/sentiment_analysis_using_random_forest.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[37]:
+
+
+import pandas as pd
+# Local directory
+Reviewdata = pd.read_csv('train_data.csv')
+# Data taken from Kaggle
+
+
+# In[38]:
+
+
+# Apply first level cleaning
+import re
+import string
+
+def text_clean_1(text):
+    """Lower-case the text, then strip [bracketed] spans, punctuation and words containing a digit."""
+    text = text.lower()
+    text = re.sub(r'\[.*?\]', '', text)
+    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
+    text = re.sub(r'\w*\d\w*', '', text)
+    return text
+
+# Alias kept so code that refers to cleaned1 keeps working.
+cleaned1 = text_clean_1
+
+
+# In[39]:
+
+
+Reviewdata.columns
+
+
+# In[40]:
+
+
+Reviewdata['cleaned_description'] = Reviewdata.review.apply(cleaned1)
+Reviewdata.head(5)
+
+
+# In[41]:
+
+
+# Apply a second round of cleaning
+def text_clean_2(text):
+    """Remove curly quotes, ellipsis characters and newline characters from the text."""
+    text = re.sub('[‘’“”…]', '', text)
+    text = re.sub('\n', '', text)
+    return text
+
+cleaned2 = text_clean_2
+
+# Let's take a look at the updated text
+Reviewdata['cleaned_description_new'] = Reviewdata['cleaned_description'].apply(cleaned2)
+Reviewdata.head(5)
+
+
+# In[42]:
+
+
+# Remove unnecessary columns
+Reviewdata.drop(columns = ['review','cleaned_description'], inplace = True)
+Reviewdata.head(4)
+
+
+# In[43]:
+
+
+from sklearn.model_selection import train_test_split
+Independent_var = Reviewdata.cleaned_description_new
+Dependent_var = Reviewdata.type
+IV_train, IV_test, DV_train, DV_test = train_test_split(Independent_var, Dependent_var, test_size = 0.2, random_state = 225)
+
+
+# In[44]:
+
+
+# Vectorizing
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.ensemble import RandomForestClassifier
+
+tvec = TfidfVectorizer()
+
+clf2 = RandomForestClassifier()
+
+
+# In[45]:
+
+
+# Use a pipeline to pass the data through the vectorizer into the random forest
+from sklearn.pipeline import Pipeline
+
+model = Pipeline([('vectorizer',tvec),('classifier',clf2)])
+
+model.fit(IV_train, DV_train)
+
+
+# In[46]:
+
+
+from sklearn.metrics import confusion_matrix
+
+predictions = model.predict(IV_test)
+
+# scikit-learn metrics take (y_true, y_pred): rows are true labels, columns are predicted labels.
+confusion_matrix(DV_test, predictions)
+
+
+# In[47]:
+
+
+from sklearn.metrics import accuracy_score, precision_score, recall_score
+
+print("Accuracy : ", accuracy_score(DV_test, predictions))
+print("Precision : ", precision_score(DV_test, predictions, average = 'weighted'))
+print("Recall : ", recall_score(DV_test, predictions, average = 'weighted'))
+
+
+# In[51]:
+
+
+# Clean the typed review with the same two passes used on the training text before predicting.
+# NOTE(review): the 0/1 checks assume `type` is encoded 0 = negative, 1 = positive - confirm against train_data.csv.
+ex=[text_clean_2(text_clean_1(input(("enter a string: "))))]
+data=model.predict(ex)
+if(data==0):
+    print("negative review")
+elif data==1:
+    print("positive review")
+
+
+# In[52]:
+
+
+# Second interactive prediction, cleaned the same way as the training text.
+ex=[text_clean_2(text_clean_1(input(("enter a string: "))))]
+data=model.predict(ex)
+if(data==0):
+    print("negative review")
+elif data==1:
+    print("positive review")
+
+
+# In[ ]:
+
+
+
+
diff --git a/train_data.rar b/train_data.rar
new file mode 100644
index 0000000..dd3678e
Binary files /dev/null and b/train_data.rar differ