mad.py

# Author: Arjun S Kulathuvayal. Intellectual property. Copyright strictly restricted
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def pca(df):
    """Print the PCA loadings and plot the per-component explained variance."""
    # Z-score each column (pandas .std() uses ddof=1) before fitting.
    df_normalized = (df - df.mean()) / df.std()
    pca = PCA(n_components=df.shape[1])
    pca.fit(df_normalized)
    # Loadings: rows are the original features, columns the principal components.
    loadings = pd.DataFrame(pca.components_.T,
                            columns=['PC%s' % _ for _ in range(len(df_normalized.columns))],
                            index=df.columns)
    print(loadings)
    plt.plot(pca.explained_variance_ratio_)
    plt.ylabel('Variance')
    plt.xlabel('Components')
    plt.show()
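
# Hedged side note (an assumed check, not part of the original script): the
# manual z-scoring in pca() uses pandas .std(), which divides by n-1 (ddof=1),
# while the StandardScaler used in pcadis() divides by n (ddof=0). The two
# normalizations therefore differ by a constant factor sqrt((n-1)/n):
def _scaling_note(df):
    n = len(df)
    manual = (df - df.mean()) / df.std()         # ddof=1
    scaled = StandardScaler().fit_transform(df)  # ddof=0
    assert np.allclose(manual.values, scaled * np.sqrt((n - 1) / n))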

def mad(df):
    """Rank features by mean absolute deviation and flag highly correlated columns."""
    # df = pd.read_csv('vehicles.csv', usecols=[i for i in range(18)])
    # DataFrame.mad() was removed in pandas 2.0, so compute the mean
    # absolute deviation explicitly.
    mad = (df - df.mean()).abs().mean()
    print("Mean absolute deviation")
    print("------------------------")
    print(mad)
    print("Best features are those that have the smallest MAD values.")
    print("------------------------------------------------------------")
    x = dict(mad)
    best_features = []
    for key, value in x.items():
        if value < 7:  # hard-coded MAD cut-off
            best_features.append(key)
    print(best_features)

    print("Correlation matrix of dataframe")
    print("-------------------------------")
    corrM = df.corr()
    print(corrM)
    # Use absolute correlations so strong negative correlations also count.
    cor_matrix = df.corr().abs()
    # Keep only the upper triangle (k=1 excludes the diagonal) so each
    # correlated pair is flagged once.
    upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))
    print("Upper triangular matrix")
    print("------------------------")
    print(upper_tri)
    to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
    print("Columns to be dropped")
    print("------------------------")
    print(to_drop)
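
# A minimal sketch of the explicit MAD formula used in mad() above, on a toy
# frame (the column names and values here are illustrative only):
def _mad_demo():
    toy = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 10.0, 10.0]})
    # Mean of |x - mean|: column 'a' deviates by 2/3 on average, 'b' by 0.
    print((toy - toy.mean()).abs().mean())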

def pcaPer(df):
    """Plot cumulative explained variance against the number of components."""
    df_normalized = (df - df.mean()) / df.std()
    pca = PCA(n_components=df.shape[1])
    pca.fit(df_normalized)
    loadings = pd.DataFrame(pca.components_.T,
                            columns=['PC%s' % _ for _ in range(len(df_normalized.columns))],
                            index=df.columns)
    fig, ax = plt.subplots()
    y = np.cumsum(pca.explained_variance_ratio_)
    # One point per fitted component, so the plot works for any column count.
    x = np.arange(1, len(y) + 1)
    plt.plot(x, y, '-o')
    plt.xlabel('Number of components')
    plt.ylabel('Cumulative variance')
    plt.title('Number of components needed to describe variance')
    plt.axhline(y=0.90, color='r', linestyle='--')
    plt.text(0.5, 0.85, '90% cut-off threshold ~ 5 components', color='red', fontsize=12)
    plt.axhline(y=0.99, color='g', linestyle='--')
    plt.text(8, 0.95, '99% cut-off threshold ~ 10 components', color='green', fontsize=12)
    ax.grid(axis='x')
    plt.show()
    print("The number of components required becomes exponentially higher for higher cumulative variance.")

def pcadis(df):
    """2-D PCA scatter of the data, coloured by the 'a' column."""
    def myplot(score, coeff, labels=None):
        # Rescale the scores into [-1, 1] so they share axes with the loadings.
        xs = score[:, 0]
        ys = score[:, 1]
        n = coeff.shape[0]  # number of loading vectors (no arrows are drawn here)
        scalex = 1.0 / (xs.max() - xs.min())
        scaley = 1.0 / (ys.max() - ys.min())
        plt.scatter(xs * scalex, ys * scaley, c=y)
        plt.xlim(-1, 1)
        plt.ylim(-1, 1)
        plt.xlabel("PC{}".format(1))
        plt.ylabel("PC{}".format(2))
        plt.grid()

    # Standardize the features and project them onto the principal components.
    y = df['a'].copy()
    X = df.drop(columns=['a']).copy().values
    scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)
    pca = PCA()
    x_new = pca.fit_transform(X)
    # I'm using only the first 2 PCs here (the data points are also scaled, above).
    myplot(x_new[:, 0:2], np.transpose(pca.components_[0:2, :]))
    plt.show()
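
# A hedged sketch (assumed, not part of the original) of the loading-vector
# overlay that usually completes a biplot; `coeff` holds the PC1/PC2 loadings
# that myplot() receives but never draws.
def _draw_loadings(coeff, labels):
    for i in range(coeff.shape[0]):
        plt.arrow(0, 0, coeff[i, 0], coeff[i, 1], color='r', alpha=0.5)
        plt.text(coeff[i, 0] * 1.15, coeff[i, 1] * 1.15, labels[i],
                 color='g', ha='center', va='center')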

if __name__ == "__main__":
    df = pd.read_csv('icsd_data_formula.csv', usecols=[3, 4, 5, 6, 7, 8])
    df = df.dropna(how='any', subset=['a', 'b', 'c', 'alpha', 'beta', 'gamma'])
    # Trim extreme values of the lattice parameters.
    df = df.drop(df[df.c > 55].index)
    df = df.drop(df[df.b > 28].index)
    df = df.drop(df[df.a > 28].index)
    df = df.drop(df[df.alpha < 60].index)
    df = df.drop(df[df.beta < 70].index)
    # mad(df)
    # pca(df)
    # pcaPer(df)
    pcadis(df)