Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Text Classification for Spam Detection #265

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions src/apps/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import streamlit as st
from utils.predict import predict_message

# Page header and a short instruction for the user.
st.title("Spam Detection System")
st.write("Enter a message to classify it as Spam or Not Spam.")

# Free-form text area for the message to classify.
user_input = st.text_area("Message")

if st.button("Classify"):
    # Guard clause: reject whitespace-only input before calling the model.
    if not user_input.strip():
        st.error("Please enter a valid message!")
    else:
        # The raw (unstripped) text is passed on; cleaning happens downstream.
        prediction, confidence = predict_message(user_input)
        st.subheader(f"Prediction: {prediction}")
        st.write(f"Confidence: {confidence:.2%}")
40 changes: 40 additions & 0 deletions src/apps/pages/models/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import pickle
from utils.preprocess import Preprocessor
import os

# Training script: fit a TF-IDF + Multinomial Naive Bayes spam classifier
# and pickle the artifacts for predict.py to load.
DATA_PATH = "dataset/spam.csv"

# Ensure the dataset folder exists (idempotent), then check for the CSV
# itself: the old folder-only check let a pre-existing empty "dataset"
# directory fall through to read_csv and die with a raw traceback.
os.makedirs("dataset", exist_ok=True)
if not os.path.exists(DATA_PATH):
    raise SystemExit(
        "Dataset folder created. Please place spam.csv in the dataset folder."
    )

# Load dataset: column v1 holds the ham/spam label, v2 the raw message text.
df = pd.read_csv(DATA_PATH, encoding="latin-1")[["v1", "v2"]]
df.columns = ["label", "message"]
df["label"] = df["label"].map({"ham": 0, "spam": 1})
# Rows whose label was neither "ham" nor "spam" map to NaN; drop them so
# MultinomialNB.fit doesn't choke on missing targets.
df = df.dropna(subset=["label"])
df["label"] = df["label"].astype(int)

# Preprocess: clean text and build the TF-IDF feature matrix.
preprocessor = Preprocessor()
X = preprocessor.fit_transform(df["message"])
y = df["label"]

# Hold out 20% for evaluation; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a multinomial Naive Bayes classifier — the standard baseline for
# TF-IDF text features.
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Persist the model and the *fitted* vectorizer so predict.py can reload
# them. NOTE(review): paths are relative to the process CWD — run from src/.
with open("utils/model.pkl", "wb") as model_file, open("utils/vectorizer.pkl", "wb") as vec_file:
    pickle.dump(model, model_file)
    pickle.dump(preprocessor.vectorizer, vec_file)
4 changes: 4 additions & 0 deletions src/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
streamlit
scikit-learn
pandas
numpy
14 changes: 14 additions & 0 deletions src/utils/predict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import pickle
from utils.preprocess import Preprocessor

# Load the trained classifier and the fitted TF-IDF vectorizer that the
# training script pickled into utils/. Runs once at import time, so any
# missing artifact fails fast when this module is first imported.
# NOTE(review): paths are relative to the process CWD — presumably the app
# is launched from src/; confirm against the deployment setup.
with open("utils/model.pkl", "rb") as model_file, open("utils/vectorizer.pkl", "rb") as vec_file:
    model = pickle.load(model_file)
    vectorizer = pickle.load(vec_file)

def predict_message(message):
    """Classify a raw message string as spam or ham.

    Returns a ``(label, confidence)`` tuple where *label* is the string
    "Spam" or "Not Spam" and *confidence* is the classifier's highest
    class probability for this message.
    """
    # Clean the raw text the same way the training data was cleaned,
    # then project it onto the fitted TF-IDF vocabulary.
    cleaned = Preprocessor().clean_text(message)
    features = vectorizer.transform([cleaned])
    label = model.predict(features)[0]
    confidence = model.predict_proba(features).max()
    return ("Spam" if label == 1 else "Not Spam", confidence)
19 changes: 19 additions & 0 deletions src/utils/preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import re
from sklearn.feature_extraction.text import TfidfVectorizer

class Preprocessor:
    """Text cleaning plus TF-IDF vectorization for spam detection."""

    def __init__(self):
        # Cap the vocabulary at 3000 terms and drop English stop words.
        self.vectorizer = TfidfVectorizer(stop_words="english", max_features=3000)

    def clean_text(self, text):
        """Return *text* lowercased, keeping only letters and single spaces."""
        letters_only = re.sub(r"[^a-zA-Z\s]", "", text)
        single_spaced = re.sub(r"\s+", " ", letters_only)
        return single_spaced.lower()

    def fit_transform(self, messages):
        """Clean every message, fit the vectorizer, and return the matrix."""
        return self.vectorizer.fit_transform(map(self.clean_text, messages))

    def transform(self, messages):
        """Clean every message and project it onto the fitted vocabulary."""
        return self.vectorizer.transform(map(self.clean_text, messages))
Loading