forked from smangham/spacewalks
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patheva_data_analysis.py
160 lines (121 loc) · 4.17 KB
/
eva_data_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import matplotlib.pyplot as plt
import pandas as pd
import sys
import re
# Data source: https://data.nasa.gov/resource/eva.json (with modifications)
def main(input_file, output_file, graph_file):
    """
    Run the spacewalk (EVA) analysis from start to finish.

    Read the JSON data, add the crew size column, save the resulting table
    as CSV and plot the cumulative spacewalk duration.

    Args:
        input_file (str): The path to the input JSON file.
        output_file (str): The path to the output CSV file.
        graph_file (str): The path to the output graph file.

    Returns:
        None
    """
    print("--START--")

    # Load the data first, then derive crew_size from the loaded table.
    eva_data = add_crew_size_column(read_json_to_dataframe(input_file))

    write_dataframe_to_csv(eva_data, output_file)
    plot_cumulative_time_in_space(eva_data, graph_file)

    print("--END--")
def read_json_to_dataframe(input_file):
    """
    Load records from a JSON file into a Pandas dataframe.

    The 'eva' column is cast to float, every row containing a missing value
    is discarded, and the remaining rows are ordered by the 'date' column.

    Args:
        input_file (str): The path to the JSON file.

    Returns:
        eva_df (pd.DataFrame): The cleaned data, sorted by date
    """
    print(f'Reading JSON file {input_file}')
    raw_df = pd.read_json(input_file, convert_dates=['date'])
    raw_df['eva'] = raw_df['eva'].astype(float)

    # Drop incomplete rows before sorting so only usable records remain.
    complete_rows = raw_df.dropna(axis=0)
    eva_df = complete_rows.sort_values('date')
    return eva_df
def write_dataframe_to_csv(df, output_file):
    """
    Save a dataframe as a CSV file, leaving out the index column.

    Args:
        df (pd.DataFrame): The dataframe to save.
        output_file (str): The path to the output CSV file.

    Returns:
        None
    """
    print(f'Saving to CSV file {output_file}')
    # index=False keeps the row labels out of the written file.
    df.to_csv(path_or_buf=output_file, index=False)
def text_to_duration(duration):
    """
    Turn a duration written as "HH:MM" into a number of hours.

    Args:
        duration (str): The duration as text, hours and minutes separated
            by a single colon

    Returns:
        duration_hours (float): The duration in hours
    """
    hours_text, minutes_text = duration.split(":")
    whole_hours = int(hours_text)
    # Minutes contribute a fraction of an hour.
    fractional_hours = int(minutes_text) / 60
    return whole_hours + fractional_hours
def add_duration_hours_variable(df):
    """
    Return a copy of the dataset with a duration_hours column added.

    Each entry of the 'duration' column is converted to hours with
    text_to_duration. The input dataframe is left unmodified.

    Args:
        df (pd.DataFrame): The input dataframe.

    Returns:
        df_copy (pd.DataFrame): A copy of df with the new duration_hours
            variable added
    """
    hours = df["duration"].apply(text_to_duration)

    # Work on a copy so the caller's dataframe does not gain the column.
    df_copy = df.copy()
    df_copy["duration_hours"] = hours
    return df_copy
def plot_cumulative_time_in_space(df, graph_file):
    """
    Plot the running total of spacewalk time against date.

    The text durations are converted to hours, accumulated with a
    cumulative sum, and plotted against the 'date' column. The figure is
    written to graph_file and then shown.

    Args:
        df (pd.DataFrame): The input dataframe.
        graph_file (str): The path to the output graph file.

    Returns:
        None
    """
    print(f'Plotting cumulative spacewalk duration and saving to {graph_file}')

    # add_duration_hours_variable returns a copy, so the new column below
    # never reaches the caller's dataframe.
    timeline = add_duration_hours_variable(df)
    timeline['cumulative_time'] = timeline['duration_hours'].cumsum()

    plt.plot(timeline['date'], timeline['cumulative_time'], 'ko-')
    plt.xlabel('Year')
    plt.ylabel('Total time spent in space to date (hours)')
    plt.tight_layout()

    # Save before showing the figure.
    plt.savefig(graph_file)
    plt.show()
def calculate_crew_size(crew):
    """
    Calculate the size of the crew for a single crew entry

    Names in the entry are separated by semicolons. The names themselves
    are counted rather than the semicolons, so the result is the same
    whether or not the last name is followed by a trailing semicolon.
    Blank segments (for example the empty text after a trailing semicolon)
    are ignored.

    Args:
        crew (str): The text entry in the crew column containing a list of
            crew member names

    Returns:
        int or None: The crew size, or None when the entry contains no names
    """
    names = [name for name in crew.split(';') if name.strip()]
    if not names:
        return None
    return len(names)
def add_crew_size_column(df):
    """
    Return a copy of the dataset with a crew_size column added.

    The crew size of each row is obtained by applying calculate_crew_size
    to its 'crew' entry. The input dataframe is left unmodified.

    Args:
        df (pd.DataFrame): The input data frame.

    Returns:
        df_copy (pd.DataFrame): A copy of df with the new crew_size
            variable added
    """
    print('Adding crew size variable (crew_size) to dataset')
    crew_sizes = df["crew"].apply(calculate_crew_size)

    # Work on a copy so the caller's dataframe does not gain the column.
    df_copy = df.copy()
    df_copy["crew_size"] = crew_sizes
    return df_copy
if __name__ == "__main__":
    # Command-line usage: script [INPUT_JSON OUTPUT_CSV]
    # Both paths must be supplied together; with fewer than two arguments
    # the default locations are used for both.
    if len(sys.argv) < 3:
        input_file = 'data/eva-data.json'
        output_file = 'results/eva-data.csv'
        # Plain string literal: the message has no placeholders to format.
        print('Using default input and output filenames')
    else:
        input_file = sys.argv[1]
        output_file = sys.argv[2]
        print('Using custom input and output filenames')

    # The graph location is fixed and is not read from the command line.
    graph_file = 'results/cumulative_eva_graph.png'
    main(input_file, output_file, graph_file)