-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathga_functions_git.py
345 lines (322 loc) · 15.6 KB
/
ga_functions_git.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 11 07:35:06 2019
@author: Scott McCallum
"""
import pandas as pd
import numpy as np
def M_matrix(data):
'''
Matrix of minerals from XRD
'''
# this function organizes the minerals of interest and then normalizes
# the minerals to 100%. The final mineral values are reported in percent
# matrix of mineral weight percents
M = data[['Quartz', 'Plag.', 'Calcite', 'Dolomite', 'Chlorite',
'Illite_Mica', 'Mx I_S', 'Pyrite']].copy()
# rename the columns
M.rename(columns={'Chlorite':'Chl', 'Illite_Mica':'Ill', 'Quartz':'Qtz',
'Pyrite':'Pyr', 'Calcite':'Cal', 'Dolomite':'Dol',
'Plag.':'Plg', 'Mx I_S':'MxIS'}, inplace=True)
# reorder cols (alphabetical order)
M = M[['Cal', 'Chl', 'Dol', 'Ill', 'MxIS', 'Plg', 'Pyr', 'Qtz']].copy()
m_sum = M.sum(axis=1)
# Normalize the data to 100% by looping through each column and divide by
# the sum * 100
for i in M.columns:
M[i] = M[i].div(m_sum)*100
return M
def E_matrix(data):
'''
Matrix of elements from XRF
'''
# this function organizes the elements of interest and reports the final
# output in percent
# put elements in alphabetical order
E = data[['Al', 'Ca', 'Fe', 'K', 'Mg', 'Na', 'Si']].copy()
# add unity to E
E['Unity'] = 100
return E
def F_variance_table(F):
# this function builds the empty min max tables called F_min and F_max.
# These tables start out with indices as the minerals of interest and the
# columns as the elements of interest. The min/max values are set to zero
# initially.
F_min = F.iloc[:, 0:-1].copy() # just dropping the unity column
F_max = F.iloc[:, 0:-1].copy() # just dropping the unity column
for i in F_min.columns:
# add 'min' and 'max' to the column names
F_min.rename(columns={i:i+'_min'}, inplace=True)
F_max.rename(columns={i:i+'_max'}, inplace=True)
# order the column names alphabetically
F_min = F_min.reindex(sorted(F_min.columns), axis=1)
F_max = F_max.reindex(sorted(F_max.columns), axis=1)
# at this point an all zero matrix has been generated with the correct
# column and row names for both the F_min and F_max matrices.
return F_min, F_max
def F_min_max_dictionary():
'''
Each element has a min and max value that can be obtained. In order to
determine the max value a given element is set to the max number of atoms
possible and all other elements are set to the min number of atoms possible.
The min value is determined in the exact opposite manner; a given element
is set to the min number of atoms possible for each mineral and all other
elements are set to the max number of atoms. It is necessary to work with
the number of atoms b/c this will impact the molecular weight, which then
impacts the fraction of each element. Note, b/c dolomite has some Fe
dependencies, the min/max values had to be swapped. This isn't a perfect
method, but allowing for some flex in each element allows for accomadation
of the fact that not all elements are available for use and neither are all
minerals (i.e. there is some unavoidable slop).
If new elements or minerals are added then they can simply be added to
the dictionary below.
'''
F_min_max_dict = {
# min max values
# CALCITE
'cal_ca_max' : 0.400479616306954,
'cal_ca_min' : 0.294236093632016,
'cal_mg_max' : 0.0765164862824062,
'cal_mg_min' : 0.0,
# CHLORITE
'chl_al_max' : 0.249809032198329,
'chl_al_min' : 0.0312286590659182,
'chl_fe_max' : 0.580279502904112,
'chl_fe_min' : 0.0,
'chl_mg_max' : 0.257261937503967,
'chl_mg_min' : 0.0,
'chl_si_max' : 0.206924493554328,
'chl_si_min' : 0.0316076110317201,
# DOLOMITE
'dol_ca_max' : 0.217353579175705,
'dol_ca_min' : 0.185615708794517,
'dol_mg_max' : 0.131887201735358,
'dol_mg_min' : 0.0,
'dol_fe_max' : 0.25864863613208,
'dol_fe_min' : 0.0,
# ILLITE
'ill_al_max' : 0.238759604762864,
'ill_al_min' : 0.0,
'ill_ca_max' : 0.00638564313686753,
'ill_ca_min' : 0.0,
'ill_fe_max' : 0.476984933423808,
'ill_fe_min' : 0.0,
'ill_k_max' : 0.136777052238806,
'ill_k_min' : 0.0456374088689976,
'ill_mg_max' : 0.0806639444550909,
'ill_mg_min' : 0.0,
'ill_na_max' : 0.00913214259186659,
'ill_na_min' : 0.0,
'ill_si_max' : 0.33667935157163,
'ill_si_min' : 0.0536809068537848,
# MIXED ILLITE SMECTITE
'mxis_al_max' : 0.217749588807779,
'mxis_al_min' : 0.0,
'mxis_ca_max' : 0.0410231215647742,
'mxis_ca_min' : 0.0,
'mxis_fe_max' : 0.436084186933133,
'mxis_fe_min' : 0.0,
'mxis_k_max' : 0.111266042514442,
'mxis_k_min' : 0.0,
'mxis_mg_max' : 0.0722454920832962,
'mxis_mg_min' : 0.0,
'mxis_na_max' : 0.0467814005729913,
'mxis_na_min' : 0.0,
'mxis_si_max' : 0.366758062410236,
'mxis_si_min' : 0.144126957644924,
# PLAGIOCLASE
'plg_al_max' : 0.193947236000288,
'plg_al_min' : 0.102121152477706,
'plg_ca_max' : 0.159528737462188,
'plg_ca_min' : 0.0,
'plg_k_max' : 0.00772132151108829,
'plg_k_min' : 0.0,
'plg_na_max' : 0.0981887756191517,
'plg_na_min' : 0.0,
'plg_si_max' : 0.301686177639351,
'plg_si_min' : 0.213542339767224,
# PYRITE
'pyr_fe_max' : 0.465486489640113,
'pyr_fe_min' : 0.465486489640113,
# QUARTZ
'qtz_si_min' : 0.467465468463971,
'qtz_si_max' : 0.467465468463971
}
return F_min_max_dict
def F_var_populate(F_min, F_max, F_min_max_dict):
'''
The purpose of this function is to populate the F_min and F_max matrices
with correct values that fall within the prescribed min and max values
from the dictionary (F_min_max_dict).
F_min = minimum elemental fractions allowable
F_max = maximum elemental fraction allowable
'''
# F_min builder
for r in F_min.index:
r_lwr = r.lower() # put everything in lower case
for c in F_min.columns: # loop through the columns
c_lwr = c.lower() # lower case
lookup = r_lwr + '_' + c_lwr # lookup value (row and column)
if lookup in F_min_max_dict:
# add the lookup value to the F_min matrix.
F_min.loc[r,c] = F_min_max_dict[lookup]
# F_max builder
for r in F_max.index:
r_lwr = r.lower() # lower case
for c in F_max.columns: # loop through the columns
c_lwr = c.lower() # lower case
lookup = r_lwr + '_' + c_lwr # lookup value (row and column)
if lookup in F_min_max_dict:
# add the lookup value to the F_max matrix.
F_max.loc[r,c] = F_min_max_dict[lookup]
return F_min, F_max
def M_matrix_pred(E,F):
''' Square matrix '''
# E: elements as coming from lithoscanner or XRF in parts per hundred (percent)
# M: minerals as coming from elan or XRD in percent
# F: tuned elemetal fractions as coming from XRF to XRD or Lithoscanner to ELAN
# Finv: Inverted F matrix
# E = M.F
# E.Finv = M or M = E.Finv
# add the unit column
F['Unity'] = 1
F = np.array(F) # convert to np array
# inverse of F
Finv = np.array(np.linalg.inv(F))
# solve for M part
M = E.dot(Finv)
# Add some column names
M.columns = ['Cal', 'Chl', 'Dol', 'Ill', 'MxIS', 'Plg', 'Pyr', 'Qtz']
return M
def parents(pop_size, F, F_max, F_min):
'''
The parent population will be user defined. The output is a list
of vectors (F_vecs) where each line of the list is a parent vector (a parent)
These parent vector are just a line of the fractions (F) values that
will be fed to the matrix prediction function in order to calculate
fitness. For example, an 7x7 array would be flattened into a 1x49 vector.
'''
F_vecs = [] # empty list of parent vectors
for i in range(pop_size):
# loop through each member of the population
# F_parent is initially just a copy of all but the unity column from
# the F matrix, which is a zero matrix.
F_parent = F.iloc[:,0:-1].copy() # copy F and drop the unity column
for r in F_parent.index:
# loop through each row of the parent
for c in F_parent.columns:
# loop through each column of the parent and assign a new value that
# is randomly selected between the min and max F tables
high = F_max.loc[r, c+'_max']
low = F_min.loc[r, c+'_min']
rando = np.random.uniform(low=low, high=high)
F_parent.loc[r,c] = rando
# After a single parent has been defined flatten the matrix to make
# it a vector
vector = list(np.array(F_parent).flatten())
# append this vector to F_vecs list.
F_vecs.append(vector)
return F_vecs
def fitness(pop_size, num_parents, F_vecs, F_max, M_actual, E,metric='rss'):
fit_list=[] # empty list to store fitness values
score_list=[]
for i in range(pop_size):
# note, F_max was used to defne shape, but F_min could have been used, or
# F.iloc[:, 0:-1] could have been used.
F_ind = np.reshape(F_vecs[i], np.shape(F_max))
# convert the F_ind to a df
F_ind = pd.DataFrame(F_ind)
# calculate the M predictions based on some matrix of F values
M_pred = M_matrix_pred(E, F_ind)
# calculate the fitness for the predicted mineral abundances
fitness = np.sum(np.square(M_actual - M_pred).sum(axis=1))
fit_list.append(fitness)
# score calculation uses root relative square error (rrse), which is the rss
# divided by the simple model (mean - actual)^2. In a perfect model
# rrse would be zero, but it is possible that your model can be worse
# than using the simple mean, and therefore, greater than 1. Since
# the perfect model has rrse=0, the score = 1 - rrse^0.5
rrse = np.square(M_actual - M_pred).sum(axis=0)
var = np.mean(M_actual, axis=0)
var = np.square(M_actual - var).sum(axis=0)
# the score for each mineral (if 8 minerals, then 8 scores)
score = 1-(rrse/var)**0.5
# for grid search, a single score is needed, therefore the avg of all
# scores is used. The higher this value is, the better.
score_rollup = np.mean(score)
score_list.append(score_rollup)
F_wFitness = pd.DataFrame(F_vecs)
F_wFitness['Fit'] = fit_list
F_wFitness['Score'] = score_list
# sort in the correct way depending on rss or rrse
if metric == 'rss': # root square error
F_wFitness.sort_values(by=['Fit'], inplace=True)
F_wFitness.reset_index(inplace=True, drop=True)
elif metric == 'rrse': # root relative square error
F_wFitness.sort_values(by=['Score'], inplace=True, ascending=False)
F_wFitness.reset_index(inplace=True, drop=True)
# copy of the upper half of most fit parents that will be used to define
# the other half of offspring
F_parents = F_wFitness.iloc[0:num_parents,0:-2]
return F_wFitness, F_parents, M_pred
def crossover(F_parents, num_parents):
'''
CROSSOVER
'''
next_gen = np.array(F_parents) # offspring will be appended to this
# initial array of zeros for the offspring
# offspring will be a matrix of F values where each row is a single
# individual
offspring = np.zeros(shape=np.shape(F_parents))
for k in range(num_parents):
crossover = int(np.random.uniform(low=1,high=np.shape(F_parents)[1]))
# Index of the first parent to mate.
# code below sourced from, Ahmed Gad:
# https://towardsdatascience.com/genetic-algorithm-implementation-in-python-5ab67bb124a6
parent1_idx = k%F_parents.shape[0]
# Index of the second parent to mate.
parent2_idx = (k+1)%F_parents.shape[0]
# The new offspring will have its first half of its genes taken from the
# first parent.
offspring[k, 0:crossover] = F_parents.iloc[parent1_idx, 0:crossover]
# The new offspring will have its second half of its genes taken from
# the second parent.
offspring[k, crossover:] = F_parents.iloc[parent2_idx, crossover:]
# next generation of offpsring and parents
next_gen = np.append(next_gen, offspring, axis=0)
return next_gen
def mutation(mutation_rate, F, F_max, F_min, num_parents, pop_size, next_gen):
'''
MUTATION
'''
# mutation for some randomly selected fraction of genes
if mutation_rate > 0: # this just allows the user to test 0 mutation
for j in range(mutation_rate):
# row index from the F matrix (F_max and F_min)
mut_row = int(np.random.uniform(low=0, high=np.shape(F.iloc[0,0:-1])[0]-1))
# col index from the F matrix (F_max and F_min)
mut_col = int(np.random.uniform(low=0, high=np.shape(F)[1]-1))
# the highest possible value based on the F_max data
high = F_max.iloc[mut_row, mut_col]
# the lowest possible value based on the F_min data
low = F_min.iloc[mut_row, mut_col]
# the actual mutation value base on some random number between the
# low and high (note, that if the value is between two numbers that are
# the same then the mut_value will be that value)
mut_value = np.random.uniform(low=low, high=high)
# converted index in the flattened array (next gen)
# this step was confusing as I had to translate a value from a 8x7 array
# to a vector that was 56 long. You can't just multiply the row by
# the col to get the location b/c this blows up on 0x0. Also, the
# discrepancy between zero indexing and the shape of F required the
# subtraction of 1 from the mut_row and mut_col (above)
next_gen_mut_col = int(mut_row * np.shape(F.iloc[0,0:-1])[0] + mut_col)
# num_parents+1 was used below b/c I did not want the mutation to
# occur in the parent population, just the offspring
next_gen_mut_row = int(np.random.uniform(low=num_parents+1,
high=pop_size))
next_gen[next_gen_mut_row, next_gen_mut_col] = mut_value
F_vecs = next_gen
else:
F_vecs = next_gen
return F_vecs