-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodel.py
60 lines (34 loc) · 1.54 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import pickle
import string
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sqlalchemy import column
# ---------------------------------------------------------------------------
# Training script: builds the per-class word-frequency tables and the class
# prior (ratio_spam) that predict() reads as module-level globals.
# ---------------------------------------------------------------------------

# Load the data set and keep only the label column (v1) and message column (v2).
# rename() without inplace returns a new frame, so the column assignments below
# never write into a slice of the frame returned by read_csv.
df = pd.read_csv("spam.csv", encoding="ISO-8859-1")
df = df[['v1', 'v2']].rename(columns={'v1': 'spam', 'v2': 'text'})

# Turn the label into a boolean: True where the label is 'spam', else False.
df['spam'] = df['spam'] == 'spam'

# Normalize the text: lower-case it, then delete punctuation.
# str.maketrans('', '', chars) builds a translation table that maps every
# character in `chars` to None; str.translate then drops those characters.
_punct_table = str.maketrans('', '', string.punctuation)
df['text'] = df['text'].apply(lambda t: t.lower().translate(_punct_table))

# Shuffle all rows (frac=1 samples the whole frame in random order).
df = df.sample(frac=1)

# First 80% of the shuffled rows for training, the remaining 20% for testing.
_split_at = int(len(df) * 0.8)
dftrain = df.iloc[:_split_at]
dftest = df.iloc[_split_at:]

# Fraction of training messages that are spam; used as the prior in predict().
ratio_spam = dftrain.spam.mean()

# Flat word lists per class. Splitting on a single space (not on arbitrary
# whitespace) is kept on purpose so tokens match what predict() produces.
trainspamword = ' '.join(dftrain.loc[dftrain.spam, 'text']).split(' ')
trainnonspamword = ' '.join(dftrain.loc[~dftrain.spam, 'text']).split(' ')

# Only words seen in BOTH classes are kept, so every stored frequency is
# strictly positive and taking its logarithm in predict() is always finite.
commonword = set(trainspamword) & set(trainnonspamword)

# Count every word once per class instead of calling list.count() for each
# vocabulary word (which rescans the whole word list every time).
_spam_counts = Counter(trainspamword)
_nonspam_counts = Counter(trainnonspamword)

# Relative frequency of each common word within each class.
trainspambag = {
    w: _spam_counts[w] / len(trainspamword) for w in commonword
}
trainnonspambag = {
    w: _nonspam_counts[w] / len(trainnonspamword) for w in commonword
}
def predict(t, verbose=False):
    """Classify message text *t* as spam or not spam.

    The text is lower-cased and stripped of punctuation (the same
    normalization applied to the training text), split on single spaces, and
    every word present in ``trainspambag`` contributes its log relative
    frequency to a spam score and a non-spam score. Each score also includes
    the log of the class prior derived from ``ratio_spam``.

    Args:
        t: Message text to classify.
        verbose: Accepted for interface compatibility; currently unused.

    Returns:
        A truthy value (NumPy boolean) when the spam score is greater than or
        equal to the non-spam score, otherwise a falsy one. Ties, including
        text with no known words when both priors are equal, count as spam.
    """
    # The vocabulary only holds lower-case, punctuation-free words, so raw
    # input must be normalized the same way or it can never match. This is a
    # no-op for text that is already normalized.
    normalized = t.lower().translate(str.maketrans('', '', string.punctuation))

    # Words outside the shared vocabulary are ignored.
    req = [w for w in normalized.split(' ') if w in trainspambag]

    # Sum of log-frequencies plus log-prior for each class.
    spamscore = sum(np.log(trainspambag[w]) for w in req) + np.log(ratio_spam)
    nonspamscore = (
        sum(np.log(trainnonspambag[w]) for w in req) + np.log(1 - ratio_spam)
    )
    return spamscore >= nonspamscore