covid_19.py

# -*- coding: utf-8 -*-
"""covid-19.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Sq3btX1kKr1dgyKZ0IRuxUOwLrGnPK7w
"""

# Commented out IPython magic to ensure Python compatibility.
# Copyright (c) 2020 Qingpeng Li. All rights reserved.
# Author: qingpeng9802@gmail.com (Qingpeng Li).

# %tensorflow_version 2.x
import tensorflow as tf
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import numpy as np
from tensorflow import keras

# Read the confirmed-case time series CSV.
tsc_df = pd.read_csv('./time_series_covid19_confirmed_global.csv')

tsc_df.head(10)

# Drop the coordinate and province columns; only per-country counts are used.
curve_df = tsc_df.drop(columns=['Lat', 'Long', 'Province/State'])

curve_df.head(10)

# Sum all rows that share a country (rows: country, columns: date).
uniq_curve_df = curve_df.groupby('Country/Region').sum()

uniq_curve_df.columns

# Keep the ten countries with the largest counts in the '3/26/20' column.
curve_data_df = uniq_curve_df.sort_values('3/26/20', ascending=False).head(10)
# Transposed copy (rows: date, columns: country).
curve_datat_df = curve_data_df.T

curve_datat_df.plot()

curve_datat_df

# For each top-10 country, record the first date on which its count reached
# 500 or more, as (date label, count). Countries that never reach 500 get no
# entry.
order = {}
# `.items()` is used because `.iteritems()` was removed in pandas 2.0.
for contry, date_vs_num in curve_datat_df.items():
    for date_l, num_l in date_vs_num.items():
        if num_l >= 500:
            order[contry] = (date_l, num_l)
            # Only the first qualifying date is wanted.
            break
order

def getCountry(country):
    """Return the fixed numeric index (0-9) assigned to `country`.

    Returns None when `country` is not one of the ten known names.
    """
    ranked_names = (
        'US',
        'China',
        'Italy',
        'Spain',
        'Germany',
        'France',
        'Iran',
        'United Kingdom',
        'Switzerland',
        'Korea, South',
    )
    index_by_name = {name: pos for pos, name in enumerate(ranked_names)}
    return index_by_name.get(country)

# Number of days between the start date (1/22/20) and the first day each
# country reached 500 cases, keyed by the country's index from getCountry.
# Entries are inserted in chronological order of that first day.
sorted_contry = sorted(
    order.items(),
    key=lambda entry: datetime.datetime.strptime(entry[1][0], '%m/%d/%y'))
order_contry_bias = {
    getCountry(name): (
        datetime.datetime.strptime(first_date, '%m/%d/%y')  # >500 date
        - datetime.datetime(2020, 1, 22, 0, 0, 0)           # start date
    ).days
    for name, (first_date, _cases) in sorted_contry
}
order_contry_bias

"""0. US	
1. China	
2. Italy	
3. Spain	
4. Germany	
5. France	
6. Iran	
7. United Kingdom	
8. Switzerland	
9. Korea, South
"""

# Convert the top-10 DataFrame to a float32 NumPy array
# (rows: country, columns: date).
# NOTE(review): later code addresses rows by the fixed numbering used in
# getCountry (e.g. row 1 = China, row 9 = Korea, South). That presumes the
# sort order of curve_data_df matches this numbering -- confirm against data.
contryarr = curve_data_df.to_numpy(dtype='float32')
contryarr

"""### Normalization Method 1  
$Ratio\\\\
=\\dfrac{MaxNumberOfCurrData}{China'sData[CurrentDay-500Day]}\\\\
=\\dfrac{MaxNumberOfCurrData}{China'sData[DiffDaysFrom500OfCurrData]}\\\\
=\\dfrac{MaxNumberOfCurrData}{China'sNumberWithDiff}$  
  
$EstimatedMaxOfCurrData=Ratio\\times China'sMax$  
  
$\\textbf{Row}\\leftarrow\\textbf{Row}/EstimatedMaxOfCurrData$
"""

# Normalize one country's row by its estimated final case count
def normalizeMaxRow(row, index):
    """Scale `row` by a maximum estimated from China's curve (row 1).

    The ratio between the country's latest value and China's value at the
    same number of days after the country's first >=500-case day
    (`order_contry_bias[index]`) is multiplied by China's maximum to
    estimate the country's own maximum. The ratio and the estimate are
    printed. Reads the module-level `contryarr` and `order_contry_bias`.

    Returns `row` divided by the estimated maximum.
    """
    china_row = contryarr[1]
    days_since_500 = len(row) - order_contry_bias[index] - 1

    ratio = row[-1] / china_row[days_since_500]
    print(index, ratio)

    estimated_m = ratio * max(china_row)
    print(index, estimated_m)

    return row / estimated_m

# Normalize every country's row (Normalization Method 1).
# Rows are collected in a list and converted once; this avoids the full-array
# copy that np.append makes on every iteration and the placeholder zero row
# that previously had to be deleted afterwards.
nor_rows = []
for index, row in enumerate(contryarr):
    if index in (1, 9):
        # China (1) and Korea, South (9): plain max-normalization, since the
        # curve is long enough that no maximum needs to be estimated.
        print(index, 1)
        new_row = row / np.max(row)
    else:
        new_row = normalizeMaxRow(row, index)
    nor_rows.append(new_row)
# The explicit reshape keeps the result 2-D even if there are no rows.
nor_contryarr = np.asarray(nor_rows, dtype='float32').reshape(
    -1, len(uniq_curve_df.columns))

# TensorBoard: write this run's logs under a timestamped directory
logdir = "logs/scalars/{}".format(
    datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
file_writer = tf.summary.create_file_writer(logdir + "/metrics")
file_writer.set_as_default()
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

# Build the model; the seed is fixed before any layer is created
tf.random.set_seed(1)
model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='sigmoid'),
    tf.keras.layers.Conv1D(filters=32, kernel_size=7, padding='same'),
    tf.keras.layers.Dense(32, activation='sigmoid'),
    tf.keras.layers.Conv1D(filters=32, kernel_size=7, padding='same'),
    tf.keras.layers.Dense(32, activation='sigmoid'),
    tf.keras.layers.Dropout(0.005),
    tf.keras.layers.Dense(32, activation='sigmoid'),
    tf.keras.layers.Dense(1),
])

# Mean-squared-error regression trained with Adam
model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(0.001))

# Reshape a series into the 3-D layout passed to the model
def prepare_train_data(contry_nparray):
    """Return `contry_nparray` as an array with a trailing axis of size 1.

    The input is wrapped in an outer list, transposed, and reshaped to
    (shape[0], shape[1], 1) of the transposed array; a 1-D input of length
    n therefore comes back with shape (n, 1, 1).
    """
    stacked = np.asarray([contry_nparray]).T
    n_rows, n_cols = stacked.shape[0], stacked.shape[1]
    return stacked.reshape((n_rows, n_cols, 1))

# Training target: row 1 of the normalized array (China in getCountryNum)
train_y = prepare_train_data(nor_contryarr[1])
print(train_y.shape)

# Training input: day indices 0..N-1 as a float32 array of shape (N, 1, 1)
train_x = np.arange(
    len(curve_data_df.columns), dtype='float32').reshape(-1, 1, 1)
print(train_x.shape)

# Validation target: row 9 of the normalized array (Korea, South)
korea_y = prepare_train_data(nor_contryarr[9])
print(korea_y.shape)

history = model.fit(
    train_x,
    train_y,
    epochs=70,
    validation_data=(train_x, korea_y),
    callbacks=[tensorboard_callback],
    verbose=1,
)

# Commented out IPython magic to ensure Python compatibility.
# %load_ext tensorboard

# Commented out IPython magic to ensure Python compatibility.
# %tensorboard --logdir logs/scalars

# Find the day shift for which a country's data is closest to the model
def bestfit_day(contry_nparr):
    """Return (shift, loss) for the shift with the lowest evaluation loss.

    For every shift from 0 up to (number of date columns - 2), the series
    with its first `shift` entries removed is evaluated against the first
    `len(train_x) - shift` inputs using the module-level `model`. The
    smallest shift wins when several shifts share the lowest loss.
    """
    n_inputs = len(train_x)
    loss_by_shift = {
        shift: model.evaluate(train_x[0:n_inputs - shift],
                              contry_nparr[shift:],
                              verbose=0)
        for shift in range(len(uniq_curve_df.columns) - 1)
    }
    best_shift = min(loss_by_shift, key=loss_by_shift.get)
    return best_shift, loss_by_shift[best_shift]

def getCountryNum(num):
    """Return the country name assigned to index `num` (0-9).

    Inverse of getCountry; returns None for any other value.
    """
    ranked_names = (
        'US',
        'China',
        'Italy',
        'Spain',
        'Germany',
        'France',
        'Iran',
        'United Kingdom',
        'Switzerland',
        'Korea, South',
    )
    return dict(enumerate(ranked_names)).get(num)


# For every country, find the day shift that best matches the model curve
# and add up the corresponding losses.
bias_day = {}
totalmse = 0.0
for index, contry in enumerate(nor_contryarr):
    train_temp = prepare_train_data(contry)
    bias_day[getCountryNum(index)], mse = bestfit_day(train_temp)
    # Row 6 (Iran in getCountryNum) is left out of the summed loss.
    # NOTE(review): the reason for this exclusion is not stated in the file.
    if index != 6:
        totalmse += mse

# The days that the curve needed to move left
print(bias_day)
print(totalmse)

#!rm -rf ./logs

# The prediction curve by the model based on China's Data.
# All days are predicted in one batched call instead of one predict call
# (plus one array concatenation) per day. The result is cast to float64 and
# flattened so `pred` keeps the dtype and 1-D shape it had before.
pred = model.predict(train_x).astype('float64').flatten()

plt.rcParams.update({'font.size': 20})
fig, ax = plt.subplots()
fig.set_size_inches(30, 15)
plt.xlabel('Day')
plt.ylabel("Confirmed Number")

# Model curve
plt.plot(train_x.reshape(-1), pred, '-o', label="PredictionCurve w/ China", c='black')

# One curve per country, shifted left by its best-fit number of days.
# The position of a name in this list is used as its row in nor_contryarr.
for index, c in enumerate(['US', 'China', 'Italy', 'Spain', 'Germany',
                           'France', 'Iran', 'United Kingdom',
                           'Switzerland', 'Korea, South']):
    shift = bias_day[c]
    days_shown = len(train_x) - shift
    # Compute the shifted x/y series once and reuse them for plot and label.
    xs = train_x[0:days_shown].reshape(-1)
    ys = nor_contryarr[index][shift:]
    plt.plot(xs, ys, '-o', label='{} \n{} Days'.format(c, days_shown))
    # Mark the last point of the curve with the number of days shown.
    plt.annotate(str(days_shown),
                 xy=(xs[-1], ys[-1]),
                 xytext=(5, 0),
                 textcoords='offset points')

plt.legend()

"""### Figure Confirmed Number vs Day:  
According to China's data and the prediction curve, the growth in the number of patients starts slowing down at about Day 35. As of 3/26/20, this means the US might need about 35-18=17 more days before its curve starts to slow down. That is, the slowdown date is about 4/12/20, and the curve would be flat at about 4/17/20.

# Logistic Function Method
"""

# For each country, keep only the part of its series that starts on the day
# it first reached >500 cases, paired with a day axis counting from 0.
ts_from500 = {}
total_days = len(uniq_curve_df.columns)
for k, v in order_contry_bias.items():
    day_axis = np.arange(total_days - v)
    ts_from500[k] = (day_axis, contryarr[k][v:])
    print(k, len(day_axis), len(ts_from500[k][1]))

from scipy.optimize import curve_fit
def logistic_fun(x, x0, L, k):
    """Evaluate the logistic function L / (1 + exp(-k * (x - x0))) at `x`.

    x0 is the midpoint, L the upper asymptote and k the growth rate.
    """
    exponent = -k * (x - x0)
    denominator = 1.0 + np.exp(exponent)
    return L / denominator

# Fit each country's post-500 series with the logistic function, then
# evaluate that country's fitted curve over the full range of days.
pred_logi = {}
params = {}
all_days = np.arange(len(uniq_curve_df.columns))
for k, (days_since_500, cases) in ts_from500.items():
    # Bounds are given as (lower, upper), each ordered (x0, L, k) to match
    # logistic_fun's parameters. The covariance estimate is not used.
    popt, _ = curve_fit(logistic_fun, days_since_500, cases,
                        bounds=([10., 9000., 0.02], [60., 1000000., 0.8]))
    print(popt)
    params[k] = popt
    # Evaluate the fitted curve for all days in one vectorized call.
    pred_logi[k] = (range(len(uniq_curve_df.columns)),
                    logistic_fun(all_days, popt[0], popt[1], popt[2]))

#pred_logi

#params

plt.rcParams.update({'font.size': 16})
fig, ax = plt.subplots()
fig.set_size_inches(30, 15)
plt.xlabel('Day')
plt.ylabel("Confirmed Number")

# Draw every fitted logistic curve; the '_nolegend_' label is passed so
# these curves do not get their own legend entries.
for days, fitted in pred_logi.values():
    plt.plot(days, fitted, '-o', label='_nolegend_')
    # Mark the last point of the curve with the number of days plotted.
    plt.annotate(str(len(days)),
                 xy=(days[-1], fitted[-1]),
                 xytext=(5, 0),
                 textcoords='offset points')

def getCountryNum(num):
    """Map an index 0-9 to its country name; None for anything else.

    This repeats the mapping of the getCountryNum defined earlier in the
    file.
    """
    names = ['US', 'China', 'Italy', 'Spain', 'Germany', 'France', 'Iran',
             'United Kingdom', 'Switzerland', 'Korea, South']
    name_by_index = dict(zip(range(len(names)), names))
    return name_by_index.get(num)

# Plot each country's observed series (from its first >500 day) and label it
# with the fitted logistic parameters: midpoint, maximum and growth rate.
for k, (days, cases) in ts_from500.items():
    labelStr = '{} \n{} Days\nMid: {:.2f} Max: {:.0f} Lrate: {:.2f}'.format(
        getCountryNum(k), len(days),
        params[k][0], params[k][1], params[k][2])

    plt.plot(days, cases, '-o', label=labelStr)
    # Mark the last observed point with the number of days plotted.
    plt.annotate(str(len(days)),
                 xy=(days[-1], cases[-1]),
                 xytext=(5, 0),
                 textcoords='offset points')

plt.legend(loc='upper left')

"""## Normalization Method 2

$EstimatedMaxOfCurrData = MaxOfLogisticFunc$
  
$\\textbf{Row}\\leftarrow\\textbf{Row}/EstimatedMaxOfCurrData$
"""

# Normalize one country's row by the maximum of its fitted logistic curve
def normalizeMaxRowLogi(row, index):
    """Return `row` divided by the fitted L parameter for country `index`.

    Reads the module-level `params`, whose entries hold the fitted
    (x0, L, k) values in that order.
    """
    return row / params[index][1]

# Normalize every country's row (Normalization Method 2).
# All rows are built first and converted once; this avoids the full-array
# copy that np.append makes per row and the placeholder zero row that
# previously had to be deleted afterwards. The explicit reshape keeps the
# result 2-D even if there are no rows.
norlogi_contryarr = np.asarray(
    [normalizeMaxRowLogi(row, index) for index, row in enumerate(contryarr)],
    dtype='float32',
).reshape(-1, len(uniq_curve_df.columns))
norlogi_contryarr

# TensorBoard: a fresh timestamped log directory for the second training run
logdir = "logs/scalars/{}".format(
    datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
file_writer = tf.summary.create_file_writer(logdir + "/metrics")
file_writer.set_as_default()
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

# Rebuild the model from scratch with the same seed and layer stack
tf.random.set_seed(1)
layer_stack = [
    tf.keras.layers.Dense(32, activation='sigmoid'),
    tf.keras.layers.Conv1D(filters=32, kernel_size=7, padding='same'),
    tf.keras.layers.Dense(32, activation='sigmoid'),
    tf.keras.layers.Conv1D(filters=32, kernel_size=7, padding='same'),
    tf.keras.layers.Dense(32, activation='sigmoid'),
    tf.keras.layers.Dropout(0.005),
    tf.keras.layers.Dense(32, activation='sigmoid'),
    tf.keras.layers.Dense(1),
]
model = tf.keras.Sequential(layer_stack)

# Mean-squared-error regression trained with Adam
model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(0.001))

# Training target: row 1 of the Method 2 array (China in getCountryNum)
train_y = prepare_train_data(norlogi_contryarr[1])
print(train_y.shape)

# Training input: day indices 0..N-1 as a float32 array of shape (N, 1, 1)
day_indices = np.arange(len(curve_data_df.columns), dtype='float32')
train_x = day_indices.reshape(-1, 1, 1)
print(train_x.shape)

# Validation target: row 9 of the Method 2 array (Korea, South)
korea_y = prepare_train_data(norlogi_contryarr[9])
print(korea_y.shape)

history = model.fit(
    train_x,
    train_y,
    epochs=70,
    validation_data=(train_x, korea_y),
    callbacks=[tensorboard_callback],
    verbose=1,
)

# For every country (Method 2 normalization), find the day shift that best
# matches the retrained model curve and add up the corresponding losses.
bias_day = {}
totalmse = 0.0
for index, contry in enumerate(norlogi_contryarr):
    train_temp = prepare_train_data(contry)
    bias_day[getCountryNum(index)], mse = bestfit_day(train_temp)
    # Row 6 (Iran in getCountryNum) is left out of the summed loss.
    # NOTE(review): the reason for this exclusion is not stated in the file.
    if index != 6:
        totalmse += mse

# The days that the curve needed to move left
print(bias_day)
print(totalmse)

# The prediction curve by the model based on China's Data.
# All days are predicted in one batched call instead of one predict call
# (plus one array concatenation) per day. The result is cast to float64 and
# flattened so `pred_nor2` keeps the dtype and 1-D shape it had before.
pred_nor2 = model.predict(train_x).astype('float64').flatten()

plt.rcParams.update({'font.size': 20})
fig, ax = plt.subplots()
fig.set_size_inches(30, 15)
plt.xlabel('Day')
plt.ylabel("Confirmed Number")

# Model curve
plt.plot(train_x.reshape(-1), pred_nor2, '-o', label="PredictionCurve w/ China", c='black')

# One curve per country, shifted left by its best-fit number of days.
# The position of a name in this list is used as its row in
# norlogi_contryarr.
for index, c in enumerate(['US', 'China', 'Italy', 'Spain', 'Germany',
                           'France', 'Iran', 'United Kingdom',
                           'Switzerland', 'Korea, South']):
    shift = bias_day[c]
    days_shown = len(train_x) - shift
    # Compute the shifted x/y series once and reuse them for plot and label.
    xs = train_x[0:days_shown].reshape(-1)
    ys = norlogi_contryarr[index][shift:]
    plt.plot(xs, ys, '-o', label='{} \n{} Days'.format(c, days_shown))
    # Mark the last point of the curve with the number of days shown.
    plt.annotate(str(days_shown),
                 xy=(xs[-1], ys[-1]),
                 xytext=(5, 0),
                 textcoords='offset points')

plt.legend()