# main.py
# Imports
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer

from src.preprocess import preprocess_text
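
# Note: preprocess_text (defined in src/preprocess.py, not shown here) is assumed
# to take a raw tweet string and return a cleaned string suitable for TF-IDF.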

# Data parameters
basedir = Path.cwd()
train_path = basedir / "Datasets" / "twitter_training.csv"        # Kaggle Twitter sentiment (train)
test_path = basedir / "Datasets" / "twitter_validation.csv"       # Kaggle Twitter sentiment (validation)
sentiment140_path = basedir / "Datasets" / "training.1600000.processed.noemoticon.csv"  # Sentiment140

# Learning parameters
checkpoint = None   # path to a model checkpoint, or None to train from scratch
BATCH_SIZE = 16     # batch size
start_epoch = 0     # epoch to resume training from
iterations = 10000  # number of training iterations
WORKERS = 4         # number of worker processes for the DataLoader
print_freq = 500    # print training status once every print_freq batches
lr = 1e-4           # learning rate
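# Note: the learning parameters above (and the torch / transformers imports) are
# not used by the TF-IDF baselines in this script; they appear to be reserved for
# a separate RoBERTa fine-tuning stage.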


def kaggle_data():
    """Load and preprocess the Kaggle Twitter sentiment dataset."""
    train_df = pd.read_csv(train_path, header=None)
    test_df = pd.read_csv(test_path, header=None)
    train_df.columns = ["id", "entity", "sentiment", "text"]
    test_df.columns = ["id", "entity", "sentiment", "text"]
    train_df = train_df[["text", "sentiment"]]
    test_df = test_df[["text", "sentiment"]]
    # Map labels: Neutral / Irrelevant -> 0, Positive -> 1, Negative -> 2
    label_map = {"Neutral": 0, "Irrelevant": 0, "Positive": 1, "Negative": 2}
    train_df["sentiment"] = train_df["sentiment"].map(label_map)
    test_df["sentiment"] = test_df["sentiment"].map(label_map)
    train_df["text"] = train_df["text"].apply(preprocess_text)
    test_df["text"] = test_df["text"].apply(preprocess_text)
    # Save the dataframes for further use
    train_df.to_pickle("train_data_S.pkl")
    test_df.to_pickle("test_data_S.pkl")
    return train_df, test_df
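
# Note: the pickled dataframes can be reloaded later without re-running the
# preprocessing step, e.g. (illustrative only):
#   train_df = pd.read_pickle("train_data_S.pkl")
#   test_df = pd.read_pickle("test_data_S.pkl")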


def sentiment140_data():
    """Load and preprocess the Sentiment140 dataset (binary labels)."""
    df = pd.read_csv(sentiment140_path, header=None,
                     names=["sentiment", "ids", "date", "flag", "user", "text"],
                     encoding="latin-1")
    df = df[["text", "sentiment"]]
    df["text"] = df["text"].apply(preprocess_text)
    # Map labels: 0 (negative) -> 0, 4 (positive) -> 1
    df["sentiment"] = df["sentiment"].map({0: 0, 4: 1})
    # Shuffle the data (fixed seed for reproducibility)
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    # Split the data into train and test sets
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    # Save the dataframes for further use
    train_df.to_pickle("train_data_B.pkl")
    test_df.to_pickle("test_data_B.pkl")
    return train_df, test_df
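
# Note: the two loaders use different label schemes (three classes for the Kaggle
# set, two for Sentiment140), so a model trained on one cannot be evaluated on the
# other without remapping the labels.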


def logistic_regression(train_df, test_df):
    """Logistic regression baseline on TF-IDF features."""
    tfidf = TfidfVectorizer(max_features=10000)
    X_train = tfidf.fit_transform(train_df["text"])
    X_test = tfidf.transform(test_df["text"])
    y_train = train_df["sentiment"]
    y_test = test_df["sentiment"]
    # Train a logistic regression model on the training data
    clf = LogisticRegression(max_iter=1000)  # named clf to avoid shadowing the module-level lr
    clf.fit(X_train, y_train)
    # Predict the sentiment of the held-out data
    y_pred = clf.predict(X_test)
    # Calculate and report the accuracy on the held-out data
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy of Logistic Regression:", accuracy)
    return accuracy


def random_forest(train_df, test_df):
    """Random forest baseline on TF-IDF features."""
    tfidf = TfidfVectorizer(max_features=10000)
    X_train = tfidf.fit_transform(train_df["text"])
    X_test = tfidf.transform(test_df["text"])
    y_train = train_df["sentiment"]
    y_test = test_df["sentiment"]
    # Train a random forest classifier on the training data
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(X_train, y_train)
    # Predict the sentiment of the held-out data
    y_pred = rf.predict(X_test)
    # Calculate and report the accuracy on the held-out data
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy of Random Forest:", accuracy)
    return accuracy
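
# Note: each baseline fits its own TfidfVectorizer on the training text; the
# vectorizer could be fit once and shared by both models to avoid duplicate work.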


def main():
    """Run the TF-IDF baselines on the Kaggle Twitter sentiment dataset."""
    train_df, test_df = kaggle_data()
    # train_df, test_df = sentiment140_data()
    logistic_regression(train_df, test_df)
    random_forest(train_df, test_df)


if __name__ == "__main__":
    main()