mlbrst.py
import warnings
warnings.filterwarnings('ignore')  # silence library warnings so the output stays readable
import numpy as np
import pandas as pd  # data loading and manipulation
import matplotlib.pyplot as plt  # visualization
import seaborn as sns
df = pd.read_csv("https://raw.githubusercontent.com/ingledarshan/AIML-B2/main/data.csv")  # load the breast-cancer dataset from this URL
df.head()  # preview the first few rows
df.columns  # list the column names
df.info()  # column types and non-null counts; note the all-NaN 'Unnamed: 32' column
df['Unnamed: 32']  # this column contains nothing but NaN, so it has to go
df = df.drop("Unnamed: 32", axis=1)  # drop it and keep the cleaned frame in df
df.head()
df.columns
df.drop('id', axis=1, inplace=True)  # 'id' has no predictive value; axis=1 drops a column, inplace=True modifies df itself
# equivalently: df = df.drop("id", axis=1)
df.columns
type(df.columns)
cols = list(df.columns)  # collect the column names as a list
# the 30 features come in three groups of ten: mean, standard error (se), and worst
features_mean = cols[1:11]    # mean features
features_se = cols[11:21]     # standard-error features
features_worst = cols[21:]    # "worst" (largest) features
print(features_mean)  # in machine learning, input columns are called features
print(features_se)
print(features_worst)
df.head(2)
df['diagnosis'].unique()  # unique labels in the target column
# M = malignant, B = benign
df['diagnosis'].value_counts()  # how many malignant vs. benign samples
df.describe()  # summary statistics for the numeric columns:
# count, mean, standard deviation, min, quartiles, and max
len(df.columns)
# correlation analysis
corr = df.corr(numeric_only=True)  # numeric_only skips the string 'diagnosis' column (required in pandas >= 2.0)
plt.figure(figsize=(8,8))
sns.heatmap(corr)  # visualize the pairwise correlations as a heatmap
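# Optional follow-up to the heatmap: many features are near-duplicates
# (radius, perimeter, and area track each other, for example). A rough
# sketch that lists feature pairs with |correlation| above 0.9; the 0.9
# threshold is an arbitrary choice.
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))  # keep each pair once (upper triangle)
high_corr = upper.stack()  # stack() drops the NaNs left by the mask
print(high_corr[high_corr.abs() > 0.9].sort_values(ascending=False))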
df['diagnosis'] = df['diagnosis'].map({'M':1, 'B':0})  # encode the target numerically: M (malignant) -> 1, B (benign) -> 0
# diagnosis is the dependent (target) variable; all other columns are independent variables (features)
X = df.drop('diagnosis', axis=1)  # independent variables (features)
X.head()
Y = df['diagnosis']  # dependent variable (target)
Y.head()
# now split X and Y into training and test portions: X_train/X_test and Y_train/Y_test
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)  # 70% train / 30% test; the fixed seed just makes the split reproducible
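# Quick sanity check: the train and test row counts should be roughly 70/30.
print(X_train.shape, X_test.shape)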
# the features have very different ranges, so standardize them to zero mean and unit variance
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_train = ss.fit_transform(X_train)  # learn the mean/std from the training data and transform it
X_test = ss.transform(X_test)        # apply the same training statistics to the test data (no refitting, to avoid leakage)
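# After scaling, every training column should have mean ~0 and std ~1;
# a quick check (fit_transform returned a NumPy array, so NumPy methods apply):
print(X_train.mean(axis=0).round(2))
print(X_train.std(axis=0).round(2))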
#MACHINE LEARNING MODELS
# fit several classifiers on the same split and compare their test accuracy
#LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, Y_train)  # train ("study") on the training set
Y_pred = lr.predict(X_test)  # predict labels for the unseen test set ("take the test")
# compare the predictions against the true labels and summarize the match as accuracy
from sklearn.metrics import accuracy_score
print(accuracy_score(Y_test, Y_pred))
# keep the score in a variable for the results table below
lr_acc = accuracy_score(Y_test, Y_pred)
print(lr_acc)
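# Accuracy alone hides which kind of errors the model makes. An optional
# sketch: the confusion matrix and per-class precision/recall for the same
# predictions (the target_names labels follow the 0/1 encoding above).
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(Y_test, Y_pred))  # rows: true class, columns: predicted class
print(classification_report(Y_test, Y_pred, target_names=['benign (0)', 'malignant (1)']))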
# COLLECT THE RESULTS IN A DATAFRAME
results = pd.DataFrame()
results
# append one row per algorithm, like a running scoreboard
tempResults = pd.DataFrame({'Algorithm':['Logistic Regression Method'], 'Accuracy':[lr_acc]})
results = pd.concat([results, tempResults], ignore_index=True)  # ignore_index renumbers the rows 0, 1, 2, ...
results = results[['Algorithm','Accuracy']]
results
# DECISION TREE CLASSIFIER
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, Y_train)
Y_pred = dtc.predict(X_test)
Y_pred
print(accuracy_score(Y_test, Y_pred))  # accuracy_score was already imported above
dtc_acc = accuracy_score(Y_test, Y_pred)
print(dtc_acc)
tempResults = pd.DataFrame({'Algorithm':['Decision Tree Classifier Method'], 'Accuracy':[dtc_acc]})
results = pd.concat([results, tempResults], ignore_index=True)
results = results[['Algorithm','Accuracy']]
results
#RANDOM FOREST CLASSIFIER
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, Y_train)
Y_pred = rfc.predict(X_test)
Y_pred
print(accuracy_score(Y_test, Y_pred))
rfc_acc = accuracy_score(Y_test, Y_pred)
print(rfc_acc)
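# Random forests also expose feature importances; an optional sketch that
# maps them back to the original column names (X.columns still holds them,
# since X_train became a bare array after scaling).
importances = pd.Series(rfc.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).head(10))  # ten most influential features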
tempResults = pd.DataFrame({'Algorithm':['Random Forest Classifier Method'], 'Accuracy':[rfc_acc]})
results = pd.concat([results, tempResults], ignore_index=True)
results = results[['Algorithm','Accuracy']]
results
#SUPPORT VECTOR CLASSIFIER
from sklearn import svm
svc = svm.SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
Y_pred
print(accuracy_score(Y_test, Y_pred))
svc_acc = accuracy_score(Y_test, Y_pred)
print(svc_acc)
tempResults = pd.DataFrame({'Algorithm':['Support Vector Classifier Method'], 'Accuracy':[svc_acc]})
results = pd.concat([results, tempResults], ignore_index=True)
results = results[['Algorithm','Accuracy']]
results
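# Optional wrap-up: sort the scoreboard and plot it. The exact accuracies
# may still vary a little between runs, since the tree-based models above
# were not given a fixed random_state.
results = results.sort_values('Accuracy', ascending=False)
print(results)
results.plot.barh(x='Algorithm', y='Accuracy', legend=False)
plt.xlabel('Test accuracy')
plt.tight_layout()
plt.show()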