-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscaymain.py
184 lines (149 loc) · 6.81 KB
/
scaymain.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 25 18:38:42 2019
@author: The Prince
"""
#%% (1) Load relevant datasets (cholera, healthsite, population)
import pandas as pd
from scayfunctions import *
# Load national cholera data
# url = "https://docs.google.com/spreadsheets/d/1P0ob0sfz3xqG8u_dxT98YcVTMwzPSnya_qx6MbX-_Z8/pub?gid=27806487&single=true&output=csv"
# Load (subnational) governorate-level cholera data
url1 = 'https://docs.google.com/spreadsheets/d/1P0ob0sfz3xqG8u_dxT98YcVTMwzPSnya_qx6MbX-_Z8/pub?gid=0&single=true&output=csv'
cholera = load_dataset(url1)
# Remove commas and convert case data to numeric
cholera['Cases'] = cholera['Cases'].str.replace(',', '')
cholera['Cases'] = pd.to_numeric(cholera['Cases'])
# Drop unnecessary columns
cholera.drop(columns=['COD Gov English','COD Gov Arabic','COD Gov Pcode'],inplace=True)
# Correct governorate-level data in cholera dataset
clean_governorate_dataset(cholera)
# Load healthsite data
url2 = 'http://data.humdata.org/dataset/8ed6967a-13c0-4b57-9873-563970a1a35f/resource/af952140-3b69-407c-9d59-a1dab2cca008/download/yemen.csv'
healthsite = load_dataset(url2)
# Load population data
url3 = 'http://data.humdata.org/dataset/8ded1878-b737-4922-adf4-05216bd46674/resource/88590116-ea8f-4daf-ae32-3db8a015a319/download/yem_pop_adm1.csv'
population = load_dataset(url3)
# Retain only governorate name and total population
population = population.filter(items=['admin1Name_en','T'])
population.rename(columns={'admin1Name_en':'Governorate','T':'Population'},inplace=True)
#%% (2) Geospatial visualization
import geopandas as gpd
import matplotlib.pyplot as plt
# Load shapefile
shp = gpd.read_file('shapefiles/governorate.shp')
# Count number of healthsites in each governorate
count_healthsites(shp,healthsite)
# Aggregate geographic, cholera, population and healthsite data
overall = combine_datasets(cholera,population,shp)
# Plot (separate) geographic maps of Yemen illustrating relevant statistics
plot_maps(overall)
#%% (3) Principal components analysis (PCA) of Yemen Governorate data
# Done to determine best clusters of governorates on the basis of several features:
# Shape_Len, Shape_Area, Healthsites, Cases, Deaths, CFR (%), Attack Rate, Population
import numpy as np
import seaborn as sns
# Set grid style using seaborn
sns.set_style("darkgrid")
# Remove geometry column
X = overall.drop(columns=['geometry'])
# Plot pairwise relationships of governorate data
g = sns.pairplot(X)
g.fig.suptitle('Pairwise Relationships of Governorate Data')
fig.subplots_adjust(top=.9,bottom=.1)
import warnings
warnings.filterwarnings("ignore")
# Standardize data prior to PCA
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
Xsc = sc.fit_transform(X)
# Standard PCA
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
Xpca = pca.fit_transform(Xsc)
# Output percentage of total variance explained through first two PCs
print(pca.explained_variance_ratio_.sum())
# K-means clustering of standard PCA
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5, random_state = 0)
labels = kmeans.fit_predict(Xpca)
# Plot clusters along with map representations
plot_pca(Xpca,overall,labels,'Standard PCA')
# Kernel PCA
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components = 2, kernel = 'rbf')
Xkpca = kpca.fit_transform(Xsc)
# K-means clustering of Kernel PCA
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=6, random_state = 0)
labels = kmeans.fit_predict(Xkpca)
# Plot clusters along with map representations
plot_pca(Xkpca,overall,labels,'Kernel PCA')
#%% (4) Time series analysis & forecasting (National data)
# Load national cholera dataset
url4 = 'https://docs.google.com/spreadsheets/d/1P0ob0sfz3xqG8u_dxT98YcVTMwzPSnya_qx6MbX-_Z8/pub?gid=27806487&single=true&output=csv'
country = load_dataset(url4)
country = country.drop(country.index[0])
country[country.columns[1:5]] = country[country.columns[1:5]].apply(pd.to_numeric,errors='coerce')
# Resample national data and train ARIMA model
z = extract_weekly_data(country)
pred,_,_ = train_ARIMA_model(z,'Yemen',False,country.iloc[0,0],0.7,4)
# Plot results related to national data
import matplotlib.dates as mdates
fig, ax = plt.subplots(figsize=(16,16))
ax.plot(z,label='Actual')
ax.plot(pred,label='Predictions')
ax.axvline(pd.to_datetime(country.iloc[0,0]),color='k',ls=':')
ax.set_title('Actual Cases vs ARIMA Predictions (National)',fontsize=20)
ax.set_xlabel('Date')
ax.set_ylabel('Cases')
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_minor_locator(mdates.MonthLocator((3,6,9,12)))
ax.xaxis.set_major_formatter(mdates.DateFormatter("\n%Y"))
ax.xaxis.set_minor_formatter(mdates.DateFormatter("%b"))
ax.legend()
fig.subplots_adjust(top=.9,bottom=.1)
#%% (5) Time series analysis & forecasting (Subnational data)
import warnings
warnings.filterwarnings("ignore")
# Extract names of all governorates
regions = cholera['Governorate'].cat.categories.tolist()
results = pd.DataFrame(columns=['Region','Values','Cases'])
# Iterate through all governorates
for name in regions:
tmp1 = pd.DataFrame(columns=['Region','Values','Cases'])
tmp2 = pd.DataFrame(columns=['Region','Values','Cases'])
# Load specific subnational data
region = cholera[cholera['Governorate'] == name]
# Extract (average) weekly subnational data
y = extract_weekly_data(region,name,False)
# Train ARIMA model
pred,_,_ = train_ARIMA_model(y,name,False,region.iloc[0,0],0.7)
# Create dummy dataframes to store data
tmp1['Cases'] = y
tmp1['Region'] = name
tmp1['Values'] = 'Actual'
tmp2['Cases'] = pred
tmp2['Region'] = name
tmp2['Values'] = 'Predicted'
# Concatenate to overall results dataframe
results = pd.concat([results, tmp1, tmp2])
results['Date'] = results.index
#train_perc = [0.7, 0.7, 0.5, 0.7, 0.8, 0.8, 0.7, 0.7, 0.7, 0.7, 0.7, 0.5, 0.8,
# 0.4, 0.4, 0.7, 0.4, 0.7, 0.7, 0.7, 0.5, 0.7]
# Plot results related to subnational data
import seaborn as sns
import matplotlib.dates as mdates
def plot_timestamp(data,**kwargs):
plt.axvline(pd.to_datetime('2018-02-18'),color='k',ls=':')
g = sns.FacetGrid(results,col='Region',col_wrap=5,hue='Values',sharey=False)
g = g.map(plot_timestamp,'Date')
for i in range(len(g.axes)):
g.axes[i].xaxis.set_major_locator(mdates.YearLocator())
g.axes[i].xaxis.set_minor_locator(mdates.MonthLocator((3,6,9,12)))
g.axes[i].xaxis.set_major_formatter(mdates.DateFormatter("\n%Y"))
g.axes[i].xaxis.set_minor_formatter(mdates.DateFormatter("%b"))
g = g.map(sns.lineplot,'Date','Cases')
g.add_legend()
g.fig.suptitle('Actual Cases vs ARIMA Predictions (Subnational)',fontsize=20)
g.fig.subplots_adjust(top=.9,bottom=.1)