-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathExplore Chicago Bikeshare Data
342 lines (282 loc) · 11.1 KB
/
Explore Chicago Bikeshare Data
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
# coding: utf-8
# Here goes the imports
import csv
import matplotlib.pyplot as plt
# Let's read the data as a list
# Every row of the CSV becomes a list of strings; row 0 is the header.
print("Reading the document...")
# newline="" hands newline handling to the csv module, which the csv
# documentation requires for file objects so that line breaks inside quoted
# fields are parsed correctly.
with open("chicago.csv", "r", newline="") as file_read:
    reader = csv.reader(file_read)
    data_list = list(reader)
print("Ok!")

# Let's check how many rows do we have
print("Number of rows:")
print(len(data_list))

# Printing the first row of data_list to check if it worked.
print("Row 0: ")
print(data_list[0])
# It's the data header, so we can identify the columns.

# Printing the second row of data_list, it should contain some data
print("Row 1: ")
print(data_list[1])

input("Press Enter to continue...")
# TASK 1
# TODO: Print the first 20 rows using a loop to identify the data.
print("\n\nTASK 1: Printing the first 20 samples")
# A slice yields exactly 20 rows (indices 0-19; the header is still row 0 at
# this point) and cannot run past the end of a short file, unlike a manual
# counter compared with `<= 20`, which visits 21 rows.
for row in data_list[:20]:
    print(row)

# Let's change the data_list to remove the header from it.
data_list = data_list[1:]

# We can access the features through index
# E.g. sample[6] to print gender or sample[-2]

input("Press Enter to continue...")
# TASK 2
# TODO: Print the `gender` of the first 20 rows
print("\nTASK 2: Printing the genders of the first 20 samples")
# The header was removed above, so the slice covers the first 20 samples.
# Index -2 (second-to-last field) is the gender column.
for sample in data_list[:20]:
    print(sample[-2])

# Cool! We can get the rows(samples) iterating with a for and the columns(features) by index.
# But it's still hard to get a column in a list. Example: List with all genders

input("Press Enter to continue...")
# TASK 3
# TODO: Create a function to add the columns(features) of a list in another list in the same order
def column_to_list(data, index):
    '''
    Return a single column of a row-oriented table as a list.

    Args:
        data: Iterable of rows; every row must support indexing (e.g. a list).
        index: Position of the wanted field inside each row. Negative values
            count from the end of the row.

    Returns:
        A new list with ``row[index]`` for every row, in the same order as
        the rows appear in ``data``.

    Raises:
        IndexError: If a row has no field at ``index``.
    '''
    return [row[index] for row in data]
# Let's check with the genders if it's working (only the first 20)
print("\nTASK 3: Printing the list of genders of the first 20 samples")
# Index -2 selects the second-to-last field of every row (the gender column).
print(column_to_list(data_list, -2)[:20])
# ------------ DO NOT CHANGE ANY CODE HERE ------------
# The checks below expect a list of 1551505 entries whose first value is the
# empty string and whose second value is "Male".
assert type(column_to_list(data_list, -2)) is list, "TASK 3: Wrong type returned. It should return a list."
assert len(column_to_list(data_list, -2)) == 1551505, "TASK 3: Wrong lenght returned."
assert column_to_list(data_list, -2)[0] == "" and column_to_list(data_list, -2)[1] == "Male", "TASK 3: The list doesn't match."
# -----------------------------------------------------
input("Press Enter to continue...")
# Now we know how to access the features, let's count how many Males and Females the dataset have
# TASK 4
# TODO: Count each gender. You should not use a function to do that.
gender_list = column_to_list(data_list, -2)
male = female = 0
for gender in gender_list:
    # Any value other than 'Male' or 'Female' (e.g. a blank field) is not tallied.
    if gender == 'Male':
        male += 1
        continue
    if gender == 'Female':
        female += 1

# Checking the result
print("\nTASK 4: Printing how many males and females we found")
print("Male: ", male, "\nFemale: ", female)

# ------------ DO NOT CHANGE ANY CODE HERE ------------
assert male == 935854 and female == 298784, "TASK 4: Count doesn't match."
# -----------------------------------------------------

input("Press Enter to continue...")
# Why don't we create a function to do that?
# TASK 5
# TODO: Create a function to count the genders. Return a list
# Should return a list with [count_male, count_female] (e.g., [10, 15] means 10 Males, 15 Females)
def count_gender(data_list):
    '''
    Count the 'Male' and 'Female' entries in the gender column.

    Args:
        data_list: List of samples (rows); the gender is read from the
            second-to-last field of each row via column_to_list.

    Returns:
        A two-element list ``[male_count, female_count]``. Entries that are
        neither 'Male' nor 'Female' are not counted.
    '''
    males = 0
    females = 0
    for value in column_to_list(data_list, -2):
        if value == 'Male':
            males += 1
            continue
        if value == 'Female':
            females += 1
    return [males, females]
# Show the [male, female] counts produced by count_gender.
print("\nTASK 5: Printing result of count_gender")
print(count_gender(data_list))
# ------------ DO NOT CHANGE ANY CODE HERE ------------
# Expected: a two-element list equal to [935854, 298784].
assert type(count_gender(data_list)) is list, "TASK 5: Wrong type returned. It should return a list."
assert len(count_gender(data_list)) == 2, "TASK 5: Wrong lenght returned."
assert count_gender(data_list)[0] == 935854 and count_gender(data_list)[1] == 298784, "TASK 5: Returning wrong result!"
# -----------------------------------------------------
input("Press Enter to continue...")
# Now we can count the users, which gender use it the most?
# TASK 6
# TODO: Create a function to get the most popular gender and print the gender as string.
# We expect to see "Male", "Female" or "Equal" as answer.
def most_popular_gender(data_list):
    '''
    Tell which gender appears most often in the given samples.

    Args:
        data_list: List of samples (rows), passed on to count_gender.

    Returns:
        "Male" or "Female" for the gender with the higher count, or "Equal"
        when both counts are the same.
    '''
    # Use the counts computed for *this* data_list; relying on module-level
    # `male`/`female` variables would ignore the argument entirely.
    male, female = count_gender(data_list)
    if male > female:
        return "Male"
    if female > male:
        return "Female"
    return "Equal"
print("\nTASK 6: Which one is the most popular gender?")
print("Most popular gender is: ", most_popular_gender(data_list))
# ------------ DO NOT CHANGE ANY CODE HERE ------------
# Expected: the string "Male".
assert type(most_popular_gender(data_list)) is str, "TASK 6: Wrong type returned. It should return a string."
assert most_popular_gender(data_list) == "Male", "TASK 6: Returning wrong result!"
# -----------------------------------------------------
# If it's everything running as expected, check this graph!
# Bar chart with one bar per gender; bar heights come from count_gender,
# whose [male, female] order matches the labels in `types`.
gender_list = column_to_list(data_list, -2)
types = ["Male", "Female"]
quantity = count_gender(data_list)
# One x position per label: 0, 1, ...
y_pos = list(range(len(types)))
plt.bar(y_pos, quantity)
plt.ylabel('Quantity')
plt.xlabel('Gender')
plt.xticks(y_pos, types)
plt.title('Quantity by Gender')
# block=True keeps the script paused until the chart window is closed.
plt.show(block=True)
input("Press Enter to continue...")
# TASK 7
# TODO: Plot a similar graph for user_types. Make sure the legend is correct.
print("\nTASK 7: Check the chart!")
def count_user_type(data_list):
    '''
    Count the 'Subscriber', 'Customer' and 'Dependent' samples.

    Args:
        data_list: List of samples (rows). The user type is read from the
            third field from the end of each row (index -3).

    Returns:
        A three-element list ``[subscriber, customer, dependent]``. Rows with
        any other user type are not counted.

    Raises:
        IndexError: If a row has fewer than three fields.
    '''
    subscriber = 0
    customer = 0
    dependent = 0
    # Read the user type from the rows that were passed in, so the result
    # depends only on the argument and not on a module-level list.
    for row in data_list:
        user_type = row[-3]
        if user_type == 'Subscriber':
            subscriber += 1
        elif user_type == 'Customer':
            customer += 1
        elif user_type == 'Dependent':
            # Dependents get their own tally instead of inflating `customer`.
            dependent += 1
    return [subscriber, customer, dependent]
# Flat list with the value at index -3 (third field from the end) of every sample.
user_types_list = column_to_list(data_list, -3)
# Labels are listed in the same order as the counts returned by count_user_type.
types = ["Subscriber", "Customer",'Dependent']
quantity = count_user_type(data_list)
# One x position per label: 0, 1, 2.
y_pos = list(range(len(types)))
plt.bar(y_pos, quantity)
plt.ylabel('Quantity')
plt.xlabel('User Type')
plt.xticks(y_pos, types)
plt.title('Quantity by User Type')
# block=True keeps the script paused until the chart window is closed.
plt.show(block=True)
input("Press Enter to continue...")
# TASK 8
# TODO: Answer the following question
# count_gender returns [male, female]; unpack it for the comparison below.
male, female = count_gender(data_list)
print("\nTASK 8: Why the following condition is False?")
print("male + female == len(data_list):", male + female == len(data_list))
answer = "Because len(data_list) also comprises the frequency of empty items, that means, when the gender wasn't inserted"
print("Answer:", answer)
# Print the two individual counts as well.
print(male)
print(female)
# ------------ DO NOT CHANGE ANY CODE HERE ------------
# Only checks that the placeholder answer text was replaced.
assert answer != "Type your answer here.", "TASK 8: Write your own answer!"
# -----------------------------------------------------
input("Press Enter to continue...")
# TASK 9
# TODO: Find the Minimum, Maximum, Mean and Median trip duration.
# You should not use ready functions to do that, like max() or min().

# The csv reader yields strings, so convert the durations (column 2) to int
# once, up front.
trip_duration_list = [int(duration) for duration in column_to_list(data_list, 2)]

# Single pass over the data: track both extremes and the running total by
# hand, since the task forbids min()/max().
min_trip = trip_duration_list[0]
max_trip = trip_duration_list[0]
sum_mean_trip = 0
for duration in trip_duration_list:
    if duration < min_trip:
        min_trip = duration
    if duration > max_trip:
        max_trip = duration
    sum_mean_trip += duration
mean_trip = sum_mean_trip / len(trip_duration_list)

# Median: middle element of the sorted values, or the mean of the two middle
# elements when the count is even. Integer division keeps the indices ints
# (`len / 2` is a float in Python 3 and cannot index a list), and avoids
# round(), whose round-half-to-even rule picks the wrong index for some odd
# lengths (e.g. round(3 / 2) == 2).
median_trip_list = sorted(trip_duration_list)
middle = len(median_trip_list) // 2
if len(median_trip_list) % 2 == 0:
    median_trip = (median_trip_list[middle - 1] + median_trip_list[middle]) / 2
else:
    median_trip = median_trip_list[middle]

print("\nTASK 9: Printing the min, max, mean and median")
print("Min: ", min_trip, "Max: ", max_trip, "Mean: ", mean_trip, "Median: ", median_trip)

# ------------ DO NOT CHANGE ANY CODE HERE ------------
assert round(min_trip) == 60, "TASK 9: min_trip with wrong result!"
assert round(max_trip) == 86338, "TASK 9: max_trip with wrong result!"
assert round(mean_trip) == 940, "TASK 9: mean_trip with wrong result!"
assert round(median_trip) == 670, "TASK 9: median_trip with wrong result!"
# -----------------------------------------------------

input("Press Enter to continue...")
# TASK 10
# Gender is easy because usually only have a few options. How about start_stations? How many options does it have?
# TODO: Check types how many start_stations do we have using set()
# NOTE: despite its name, `user_types` holds the distinct values of column 3
# (the start stations, per the task text above). The name is kept because the
# protected assert below refers to it.
user_types = set(column_to_list(data_list, 3))
print("\nTASK 10: Printing start stations:")
print(len(user_types))
print(user_types)
# ------------ DO NOT CHANGE ANY CODE HERE ------------
# Expected: 582 distinct values.
assert len(user_types) == 582, "TASK 10: Wrong len of start stations."
# -----------------------------------------------------
input("Press Enter to continue...")
# TASK 11
# Go back and make sure you documented your functions. Explain the input, output and what it does. Example:
# def new_function(param1: int, param2: str) -> list:
'''
Example function with annotations.
Args:
param1: The first parameter.
param2: The second parameter.
Returns:
List of X values
'''
# TASK 12 - Challenge! (Optional)
# TODO: Create a function to count user types without hardcoding the types
# so we can use this function with a different kind of data.
print("Will you face it?")
# "yes" enables the optional challenge checks that follow count_items.
answer = "yes"
# NOTE(review): this module-level `i` does not appear to be read afterwards.
i=0
def count_items(column_list):
    '''
    Count how often each distinct value occurs in a column.

    Args:
        column_list: Iterable of hashable values (e.g. the list returned by
            column_to_list).

    Returns:
        A tuple ``(item_types, item_counts)`` of two lists of equal length:
        ``item_types`` holds every distinct value once, in order of first
        appearance, and ``item_counts[k]`` is the number of occurrences of
        ``item_types[k]``. Both lists are empty for empty input.

    Raises:
        TypeError: If a value is not hashable.
    '''
    # One pass with a dict tally; calling list.count() once per distinct
    # value would rescan the whole column for every value.
    tally = {}
    for item in column_list:
        tally[item] = tally.get(item, 0) + 1
    item_types = list(tally)
    item_counts = list(tally.values())
    return item_types, item_counts
# Run the optional challenge checks only when the answer above is "yes".
if answer == "yes":
    # ------------ DO NOT CHANGE ANY CODE HERE ------------
    # Count the distinct values of the gender column (index -2).
    column_list = column_to_list(data_list, -2)
    types, counts = count_items(column_list)
    print("\nTASK 11: Printing results for count_items()")
    print("Types:", types, "Counts:", counts)
    # Expected: three distinct gender values whose counts add up to 1551505.
    assert len(types) == 3, "TASK 11: There are 3 types of gender!"
    assert sum(counts) == 1551505, "TASK 11: Returning wrong result!"