# Linear regression with TensorFlow
# This notebook shows how to perform a linear regression with the help of the
# `TensorFlow` library. As an example, a set of OpenStreetMap elements gathered
# around Bordeaux is used. The goal of the notebook is to predict the number
# of contributors for each element, starting from a set of other element
# characteristics.
# Step 0: module imports
# `matplotlib` will be used to plot regression results, `os` is needed for
# relative path handling, `pandas` handles the input dataframe, and, of
# course, `tensorflow` does the regression itself (`math` only provides the
# square root used when reporting the test error).
import math
import matplotlib.pyplot as plt
import os
import pandas as pd
import tensorflow as tf
# Step 1: data recovery and preparation
# The data describe a set of OpenStreetMap elements; we assume the file is
# available locally.
data = pd.read_csv("/home/rde/data/osm-history/output-extracts/bordeaux-metropole/element-metadata.csv", index_col=0)
data.shape
# We have 2760999 individuals in this table, described by 17 different
# features. Here is a short extract of this dataset:
print(data.sample(2).T)
# In this study, we consider the number of contributors as the output to
# predict. We select a small set of features as predictors: the number of days
# between first and last modifications, the number of days since the first
# modification, the number of days during which modifications occurred, the
# last version, the number of changesets, and the numbers of autocorrections
# and corrections.
data_x = data[["lifespan", "n_inscription_days", "n_activity_days", "version", "n_chgset", "n_autocorr", "n_corr"]].values
data_y = data[["n_user"]].values
print(data_x.shape)
print(data_y.shape)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y,
                                                    test_size=0.1)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
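# Optional aside (not in the original pipeline): features such as `lifespan`
# and `version` live on very different scales, which can slow down or
# destabilize gradient-based training. A minimal standardization sketch, using
# scikit-learn's `StandardScaler` and fitting on the training set only so that
# no test statistics leak in; uncomment to use it (note that the learned
# weights would then refer to standardized features):
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler().fit(x_train)
# x_train = scaler.transform(x_train)
# x_test = scaler.transform(x_test)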
# Step 2: Parameter settings
learning_rate = 0.01
training_epochs = 15000
display_step = 50
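# In other words: the optimizer takes steps scaled by 0.01, the full training
# set is fed 15000 times, and logs (plus a checkpoint) are produced every 50
# epochs.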
# Step 3: Prepare the checkpoint creation
def make_dir(path):
    """Create a directory if there isn't one already."""
    try:
        os.mkdir(path)
    except OSError:
        pass
make_dir('../checkpoints')
make_dir('../checkpoints/linreg_osm')
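# Note: on Python 3, `os.makedirs('../checkpoints/linreg_osm', exist_ok=True)`
# would create the whole directory chain in a single call.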
global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')
# Step 4: Tensorflow model design
# We need two tensors, *i.e.* one placeholder for the inputs and one for the
# outputs.
with tf.name_scope("data"):
X = tf.placeholder(tf.float32, name="X")
Y = tf.placeholder(tf.float32, name="Y")
# The linear regression is defined through weights and biases, which are set
# as TensorFlow variables and combined into the output tensor `predictions`.
with tf.name_scope("linear_reg"):
w = tf.get_variable('weights', [data_x.shape[1], 1], initializer=tf.truncated_normal_initializer())
b = tf.get_variable('biases', [1], initializer=tf.constant_initializer(0.0))
predictions = tf.add(tf.matmul(X, w), b)
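# In matrix form the model reads predictions = X.w + b, i.e. for each element:
# n_user ~ w1*lifespan + w2*n_inscription_days + ... + w7*n_corr + b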
# The cost function is the sum of squared differences between the predictions
# and the true outputs. A regularization term is added to this value.
with tf.name_scope('loss'):
    # loss function + regularization value
    loss = tf.reduce_sum(tf.square(predictions - Y)) + 0.01 * tf.nn.l2_loss(w)
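# Note that `tf.nn.l2_loss(w)` computes sum(w**2) / 2, so the effective
# penalty weight on ||w||^2 is 0.005 here.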
# We use the Adam optimizer to update the model variables (other choices:
# GradientDescentOptimizer, AdadeltaOptimizer, AdagradOptimizer...).
with tf.name_scope('train'):
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step=global_step)
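# Swapping the optimizer only requires changing this line, e.g. for plain
# gradient descent:
# train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(
#     loss, global_step=global_step)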
# Step 5: Variable initialization
init = tf.global_variables_initializer()
# Final step: running the model
# First we have to open a new session, initialize the variables, and prepare
# the graph and checkpoint utilities:
session = tf.Session()
session.run(init)
saver = tf.train.Saver()
writer = tf.summary.FileWriter('../graphs/linreg', session.graph)
# The checkpoint folders must already exist (they were created in Step 3).
ckpt = tf.train.get_checkpoint_state(os.path.dirname('../checkpoints/linreg_osm/checkpoint'))
# If a checkpoint exists, restore the model state from it.
if ckpt and ckpt.model_checkpoint_path:
    saver.restore(session, ckpt.model_checkpoint_path)
initial_step = global_step.eval(session=session)
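# If no checkpoint was found, `global_step` keeps its initial value, so
# `initial_step` is 0 and training starts from scratch.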
# The model is ready to be trained. We run as many training epochs as
# specified by the parameters above.
costs = list()
weights = list()
biases = list()
print("### Training step ###")
for epoch in range(initial_step, training_epochs):
    session.run(train_op, feed_dict={X: x_train, Y: y_train})
    # Display logs and save a checkpoint every `display_step` epochs
    if (epoch + 1) % display_step == 0:
        training_cost, weight, bias = session.run([loss, w, b],
                                                  feed_dict={X: x_train, Y: y_train})
        print("*** Epoch", '%04d' % (epoch + 1),
              "cost={}\nn_user = {:.3f}*X1 + {:.3f}*X2 + {:.3f}*X3 + {:.3f}*X4 "
              "+ {:.3f}*X5 + {:.3f}*X6 + {:.3f}*X7 + {:.3f} ***".format(
                  training_cost, weight[0][0], weight[1][0], weight[2][0],
                  weight[3][0], weight[4][0], weight[5][0], weight[6][0], bias[0]))
        costs.append(training_cost)
        weights.append(weight[:, 0])
        biases.append(bias[0])
        saver.save(session, '../checkpoints/linreg_osm/epoch', epoch)
# The results are stored in a pandas dataframe and saved to the file system.
param_history = pd.DataFrame(weights, columns=["lifespan", "n_inscription_days", "n_activity_days", "version", "n_chgset", "n_autocorr", "n_corr"])
param_history["bias"] = biases
param_history["loss"] = costs
if initial_step == 0:
    param_history.to_csv("linreg_osm.csv", index=False)
else:
    param_history.to_csv("linreg_osm.csv", index=False, mode='a', header=False)
# Then the data in this `csv` file are read back. We do not rely on the
# current `param_history` value, as it may represent only the last training
# steps if training has just been restored from a checkpoint. We use these
# results for plotting purposes.
param_history = pd.read_csv("linreg_osm.csv", index_col=False)
# Graphic display: the 3x3 grid holds the 9 tracked series, i.e. the 7 feature
# weights, the bias, and the loss.
f, ax = plt.subplots(3, 3, figsize=(12, 6))
for i in range(param_history.shape[1]):
    ax[i % 3][int(i / 3)].plot(param_history.iloc[:, i])
    ax[i % 3][int(i / 3)].set_title(param_history.columns[i])
f.tight_layout()
f.show()
# Then the model is run with the test data (this dataset was not used for
# model training). The goal is to evaluate the correspondence between the true
# values of `y` and the model predictions.
print("### Test step ###")
cost, y_pred = session.run([loss, predictions], feed_dict={X: x_test, Y: y_test})
print("""Test cost = {}, i.e. +/- {:.3f} contributor(s) per OSM elements on
average""".format(cost, math.sqrt(cost/len(y_test))))
# A last plot is produced from the test step: it shows how close the
# predictions are to the true values.
plt.plot(y_test, y_pred, 'go')
output_min, output_max = int(min(y_pred)[0]), int(max(y_pred)[0])
plt.plot(range(output_min, output_max+2), range(output_min, output_max+2))
plt.xlabel("True values of y")
plt.ylabel("Model predictions")
plt.xlim(min(y_test)[0], max(y_test)[0]+2)
plt.ylim(output_min, output_max+2)
plt.tight_layout()
plt.show()
# Finally, the TensorFlow session is closed.
session.close()