-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathReview_Ranker.py
268 lines (201 loc) · 8.19 KB
/
Review_Ranker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
#!/usr/bin/env python
# coding: utf-8
###################################################################################
# Module Imports
# Review Scraping Modules
import selenium
from selenium.webdriver import Chrome, ChromeOptions
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.options import Options
import pandas as pd
import numpy as np
# Create Feature Modules
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import spacy
nlp = spacy.load("en_core_web_sm")
import re
# Predictor Modules
from sklearn.feature_extraction.text import TfidfVectorizer
#Ranker Module
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
# General
import warnings
warnings.filterwarnings('ignore')
####################################################################################
# Function Block
# Get Reviews
def get_review(user_url, pages=4):
    '''Scrape reviews of a Flipkart product into a `pandas` DataFrame.

    Parameters
    -----------
    user_url: URL of the Flipkart product page (the `/p/` URL) whose reviews
        should be extracted.
    pages: Number of review pages to scrape, starting from page 1.
        Defaults to 4.

    Returns
    -------
    (product_name, df): the product title read from the first review page and
        a DataFrame with the columns `Review_Text`, `Review_Rating`, `Upvote`,
        `Downvote` and `Num_Photos ` (all but the text cast to int).

    Raises
    ------
    ValueError: if `user_url` does not contain 'flipkart' or `pages` < 1.

    Example
    -------
    >>> name, df = get_review("https://www.flipkart.com/redmi-8-ruby-red-64-gb/p/itmef9ed5039fca6?pid=MOBFKPYDCVSCZBYR")
    '''
    # The product name is also published as a module-level global, as before,
    # in case other code reads it from there.
    global product_name

    # Validate up front so no browser is started for unusable input.
    if 'flipkart' not in user_url:
        raise ValueError(f"Not a Flipkart product URL: {user_url!r}")
    if pages < 1:
        raise ValueError("pages must be at least 1")

    review_url = user_url.replace('/p/', '/product-reviews/')
    # Append the page number as a proper query parameter whether or not the
    # URL already carries a query string.
    separator = '&' if '?' in review_url else '?'

    # Browser Options
    options = Options()
    options.add_argument("--headless")
    options.add_argument('start-maximized')

    # No driver path is passed: chromedriver is expected to be discoverable
    # (e.g. in the current directory / on PATH).
    driver = webdriver.Chrome(options=options)

    rows = []
    try:
        for page in range(1, pages + 1):
            # driver.get() performs the navigation itself; the previous
            # window.open + `until(EC.staleness_of)` combination never
            # actually waited, because the bare condition factory is truthy.
            driver.get(f'{review_url}{separator}page={page}')

            # Expand every truncated review so the full text is scraped.
            for read_more in driver.find_elements(By.CLASS_NAME, '_1EPkIx'):
                driver.execute_script(
                    "return arguments[0].scrollIntoView();", read_more)
                # Scroll back up a little before clicking the button.
                driver.execute_script("window.scrollBy(0, -150);")
                read_more.click()

            # The product name only needs to be read once.
            if page == 1:
                product_name = driver.find_element(
                    By.XPATH, "//div[@class='o9Xx3p _1_odLJ']").text

            # One row per review block on the page.
            for block in driver.find_elements(
                    By.XPATH, "//div[@class='col _390CkK _1gY8H-']"):
                rows.append((
                    block.find_element(
                        By.XPATH, ".//div[@class='qwjRop']").text,
                    block.find_element(
                        By.XPATH,
                        ".//div[@class='hGSR34 E_uFuv'or @class='hGSR34 _1x2VEC E_uFuv' or @class='hGSR34 _1nLEql E_uFuv']").text,
                    block.find_element(
                        By.XPATH, ".//div[@class='_2ZibVB']").text,
                    block.find_element(
                        By.XPATH, ".//div[@class='_2ZibVB _1FP7V7']").text,
                    len(block.find_elements(
                        By.XPATH, ".//div[@class='_3Z21tn _2wWSCV']")),
                ))
    finally:
        # Always shut the browser down, even when scraping fails midway.
        driver.quit()

    # NOTE(review): 'Num_Photos ' carries a trailing space in the original
    # schema; it is kept unchanged so existing consumers keep working.
    numeric_columns = ['Review_Rating', 'Upvote', 'Downvote', 'Num_Photos ']
    df = pd.DataFrame(data=rows, columns=['Review_Text'] + numeric_columns)

    # The scraped rating and vote counts are text; cast them to integers.
    df = df.astype({column: "int" for column in numeric_columns})

    return product_name, df
#==================================================================================##
# Create Features
# *******Sub Functions********
# 1. Sentiment
def sentimental_score(sentence):
    '''Classify `sentence` as 'pos', 'neu' or 'neg' from its VADER compound score.

    Returns 'pos' when the compound score is >= 0.5, 'neg' when it is
    <= -0.5 and 'neu' for everything in between.
    '''
    # Build the analyzer once and reuse it: this function is applied to every
    # review, and constructing a fresh analyzer per call is wasted work.
    analyzer = getattr(sentimental_score, "_analyzer", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        sentimental_score._analyzer = analyzer

    compound = analyzer.polarity_scores(sentence)['compound']
    if compound >= 0.5:
        return 'pos'
    if compound <= -0.5:
        return 'neg'
    return 'neu'
# 2. Target
def target(df):
    '''Add the target column `h` to `df` in place and return `df`.

    `h` is Upvote / (Upvote + Downvote), rounded to two decimals.
    '''
    upvotes = df['Upvote']
    total_votes = upvotes + df['Downvote']
    df['h'] = np.round(upvotes / total_votes, 2)
    return df
# 3. Drop Unwanted Columns
def drop_cols(df):
    '''Return `df` without the `Sum_of_Up_Down`, `Upvote` and `Downvote` columns.

    The input frame is not modified; a new frame is returned.
    '''
    return df.drop(columns=["Sum_of_Up_Down", "Upvote", "Downvote"])
# 4. Number of sentence
def num_sentence(text):
    '''Return how many sentences the module-level spaCy pipeline finds in `text`.'''
    return sum(1 for _ in nlp(text).sents)
# 8. Remove Emoji
def remove_emoji(text):
    '''Drop every non-ASCII character from `text` and trim surrounding whitespace.

    Emoji are removed this way, but so is any other non-ASCII character
    (accented letters, currency symbols, ...).
    '''
    ascii_only = ''.join(ch for ch in text if ord(ch) < 128)
    return ascii_only.strip()
# 9. Remove Punctuations
def remove_punctuations(text):
    '''Strip punctuation from `text`, then trim surrounding whitespace.

    Word characters, whitespace and the characters `%`, `,`, `-` and `.`
    are kept; every other character is removed.
    '''
    # Raw string avoids invalid-escape warnings for \w and \s. The hyphen is
    # escaped so it is clearly a literal rather than the `,-.` range the old
    # pattern relied on (which only happened to cover the same characters).
    return re.sub(r'[^\w\s%,.\-]', "", text).strip()
def pos_tag(text):
    '''Return the spaCy coarse POS tag of every token in `text`, space-separated.'''
    tags = (token.pos_ for token in nlp(text))
    return ' '.join(tags)
def Adj(text):
    '''Return the percentage of `ADJ` tags in a space-separated POS-tag string.

    Parameters: `text` is a string of whitespace-separated tags, such as the
    output of `pos_tag`.
    Returns: share of tags equal to 'ADJ', as a percentage rounded to two
    decimals; 0.0 when `text` contains no tags at all.
    '''
    tags = text.split()
    if tags:
        percentage = tags.count('ADJ') / len(tags) * 100
    else:
        # An empty review yields no tags; avoid dividing by zero.
        percentage = 0.0
    return np.round(percentage, 2)
#*************************************************************************************#
# *******Main Function*******
def features(df):
    '''Build the feature set used for ranking from a scraped review frame.

    Parameters: `df` must provide the columns `Review_Text`, `Upvote` and
    `Downvote`. The caller's frame is left untouched.
    Returns: a new DataFrame restricted to reviews with more than 10 votes in
    total and more than 10 words of cleaned text. `Upvote`/`Downvote` are
    replaced by the target `h`; `Sentiment`, `Num_Sentence` and `Perc_Adj`
    are added; `Review_Text` is cleaned, lemmatized and stripped of stop
    words. The original row index is preserved.
    '''
    # Work on a copy so the caller's frame is not mutated.
    df = df.copy()

    # Keep only reviews whose total vote count exceeds 10. The total must be
    # the SUM of up- and downvotes: filtering on the difference would (for
    # non-negative counts) keep only rows with h > 0.5.
    df['Sum_of_Up_Down'] = df.Upvote + df.Downvote
    # .copy() makes the filtered result an independent frame, so the column
    # assignments below do not write into a slice of another frame.
    df = df[df.Sum_of_Up_Down > 10].copy()

    # Sentiment label of the raw review text.
    df['Sentiment'] = df.Review_Text.apply(sentimental_score)

    # Create the target and drop the vote-count columns it was derived from.
    df = target(df)
    df = drop_cols(df)

    # Sentence count and adjective percentage are computed on the raw text,
    # before any cleaning.
    df['Num_Sentence'] = df.Review_Text.apply(num_sentence)
    df['Perc_Adj'] = df.Review_Text.apply(pos_tag).apply(Adj)

    # Clean the text: drop non-ASCII characters (emoji), then punctuation.
    df['Review_Text'] = (
        df.Review_Text.apply(remove_emoji).apply(remove_punctuations))

    # Discard short reviews (10 words or fewer after cleaning).
    df = df[df.Review_Text.str.split().str.len() > 10].copy()

    # Lemmatize the review and remove stop words.
    df['Review_Text'] = df.Review_Text.apply(
        lambda text: " ".join(
            token.lemma_ for token in nlp(text) if not token.is_stop))
    return df
#=================================================================================#
# Creates Predictors
def predictor(df, n=0.01):
    '''Build the model inputs `X` and target `y` from a feature frame.

    Parameters
    ----------
    df: frame with a `Review_Text` column, the target column `h` and any
        other feature columns. Object-dtype columns are excluded from X.
    n: `min_df` passed to the TF-IDF vectorizer. With the default of 0.01,
        terms occurring in fewer than 1 percent of the reviews are dropped.

    Returns
    -------
    (X, y): X holds the TF-IDF unigram columns followed by the non-object
        columns of `df` (without `h`); y is the `h` column. Both use a fresh
        0..n-1 index.
    '''
    tfidf = TfidfVectorizer(
        token_pattern='(?ui)\\b\\w*[a-z]+\\w*\\b', min_df=n, stop_words="english")
    matrix = tfidf.fit_transform(df.Review_Text)

    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); support both.
    if hasattr(tfidf, 'get_feature_names_out'):
        vocabulary = tfidf.get_feature_names_out()
    else:
        vocabulary = tfidf.get_feature_names()
    unigram = pd.DataFrame(matrix.toarray(), columns=vocabulary)

    # The TF-IDF frame is indexed 0..n-1 while `df` may still carry the index
    # of the rows that survived earlier filtering. join() aligns on the
    # index, so reset it first; otherwise features and targets are attached
    # to the wrong reviews and the fillna(0) below hides the damage.
    numeric = df.select_dtypes(exclude=['object']).reset_index(drop=True)

    main = unigram.join(numeric).fillna(0)
    X = main.drop('h', axis=1)
    y = main.h
    return X, y
#===============================================================================#
# Main Ranker Function
def rank(X, y):
    '''Fit a random forest on (X, y) and return its predictions for X.

    PARAMETERS: X is the feature matrix, y the target values.
    RETURNS: y_pred, the fitted model's predictions for the same rows it
    was trained on (no separate test split is made here).
    '''
    forest = RandomForestRegressor(random_state=0, n_estimators=100)
    return forest.fit(X, y).predict(X)