
Commit

fixed: directory structure
Anmol2059 committed Dec 13, 2024
1 parent d6cfdfd commit f2bfe84
Showing 9 changed files with 35 additions and 229 deletions.
3 changes: 1 addition & 2 deletions models/m1_chipsal.py
@@ -61,8 +61,7 @@ def __len__(self):
weight_decay=0.01,
logging_dir=f"{output_dir_name}/logs",
load_best_model_at_end=True,
save_total_limit=1 # Keeps only the latest checkpoint to save disk space
)
save_total_limit=1

def compute_metrics(pred):
labels = pred.label_ids
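
The body of compute_metrics is collapsed in the hunk above. For context, a minimal sketch of such a metric function for binary hate/non-hate classification with the Hugging Face Trainer — the exact metrics returned here are an assumption, not the committed code:

import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    # Trainer passes an EvalPrediction carrying label_ids and raw logits.
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
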
12 changes: 6 additions & 6 deletions models/m2_chipsal.py
@@ -14,8 +14,8 @@
tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
muril_model = AutoModel.from_pretrained("google/muril-base-cased").to("cuda" if torch.cuda.is_available() else "cpu")

train_df = pd.read_csv('/home/anmol/codes/CHIPSAL/dataset/train.csv')
test_df = pd.read_csv('/home/anmol/codes/CHIPSAL/dataset/test.csv')
train_df = pd.read_csv('/codes/CHIPSAL/dataset/train.csv')
test_df = pd.read_csv('/codes/CHIPSAL/dataset/test.csv')

train_df['tweet'] = train_df['tweet'].astype(str)
test_df['tweet'] = test_df['tweet'].astype(str)
@@ -44,9 +44,9 @@ def get_muril_embeddings(text):
tabnet_clf.fit(
train_embeddings,
train_labels,
max_epochs=150, # Increase number of epochs
patience=10, # Early stopping after 10 epochs without improvement
batch_size=256, # Adjust batch size for your GPU memory
max_epochs=150,
patience=10,
batch_size=256,
drop_last=False
)

@@ -55,7 +55,7 @@ def get_muril_embeddings(text):
pred_labels = np.where(pred_labels == 1, 1, 0)

submission_df = pd.DataFrame({
'index': test_df['index'], # Using 'index' from the test data
'index': test_df['index'],
'prediction': pred_labels.tolist()
})

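
get_muril_embeddings is referenced in the hunk headers above but its body is collapsed. A self-contained sketch of such a helper, assuming mean pooling over MuRIL's last hidden state (the pooling strategy and max_length are assumptions):

import torch
from transformers import AutoTokenizer, AutoModel

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
muril_model = AutoModel.from_pretrained("google/muril-base-cased").to(device)

def get_muril_embeddings(text):
    # Encode one tweet and mean-pool the last hidden state into a single vector.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = muril_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

Vectors produced this way are what the TabNet classifier above is fit on.
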
36 changes: 7 additions & 29 deletions models/m3_chipsal.py
@@ -1,29 +1,15 @@
# -*- coding: utf-8 -*-
"""HateSpeech_2

Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/hatespeech-2-5962ccfa-fa95-439c-aff3-2cc51df48d36.ipynb%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com/20241106/auto/storage/goog4_request%26X-Goog-Date%3D20241106T170641Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D40face555c0fc1faa66142e80157900cf2e64e6e1dce102a87b6d7c730a256ca23407e52a59ea201d90df69a71282052135a500e86b48dc61b7af705eb7f8f70cd2e4ea86571028a32d3bb2e1096535710af455eed992abb3091097adebbb32c8bbc8c8ed91cdbddd2fdcf65b6a6923d924838a27ea9c435a653b7698eb7d1529f030d2f59fb211337ec86a3fd9c04374ce2c5e71ef5ee2027bbbba9f8b07d2cc22a720fc0b29cb34527c19e7149c69f70dd447f68e8bb25854e5ddf135a13d9c49634926ceb65c0dbd4d76c97b145e3439802f40310276e15a2aa49f4bbf972b1bbebfbbe5ffe26ed9e4def71bf1268e238b877750810269b187d2e570a9185
"""

# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
kagglehub.login()

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.


nadspoudel_hs_dataset2_path = kagglehub.dataset_download('nadspoudel/hs-dataset2')
nadspoudel_test_dataset_path = kagglehub.dataset_download('nadspoudel/test-dataset')

print('Data source import complete.')

!pip install -q torch transformers huggingface_hub datasets
# !pip install -q torch transformers huggingface_hub datasets

from transformers import AutoModelForSequenceClassification, AutoTokenizer,TrainingArguments,Trainer
import torch
@@ -93,7 +79,7 @@ def __getitem__(self, idx):
def __len__(self):
return len(self.labels)

# Create train and test datasets

train_dataset = HateSpeechDataset(train_encodings, train_labels)
test_dataset = HateSpeechDataset(test_encodings, test_labels)

@@ -107,7 +93,7 @@ def __len__(self):
weight_decay=0.01,
logging_dir='/kaggle/working/logs',
logging_steps=10,
save_safetensors=False # Disable safetensors saving
save_safetensors=False

)

@@ -127,14 +113,12 @@ def __len__(self):
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# Get predictions and labels using trainer.predict()

pred_output = trainer.predict(test_dataset)

# Extract predictions and labels
predictions = pred_output.predictions.argmax(axis=1) # Assuming it's a classification task
predictions = pred_output.predictions.argmax(axis=1)
labels = pred_output.label_ids

# Calculate accuracy and F1 score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

accuracy = accuracy_score(labels, predictions)
@@ -150,7 +134,6 @@ def __len__(self):

test_df['tweet'] = test_df['tweet'].apply(preprocess_text)

# Tokenize the cleaned test tweets
test_encodings = tokenizer(
test_df['tweet'].tolist(),
padding=True,
@@ -172,28 +155,23 @@ def __len__(self):

test_dataset = HateSpeechDatasetTestOnly(test_encodings)

# Make predictions on the test set
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

# Create a DataFrame for predictions
submission_df = pd.DataFrame({
'index': test_df['index'], # Assuming 'index' column exists in the test data
'index': test_df['index'],
'prediction': pred_labels.tolist()
})

# Sort the DataFrame by index to meet submission requirements
submission_df = submission_df.sort_values(by='index').reset_index(drop=True)

json_records = submission_df.apply(lambda row: {"index": int(row['index']), "prediction": int(row['prediction'])}, axis=1)

# Save to a JSON file
json_file_path = '/kaggle/working/submission.json'
with open(json_file_path, 'w') as json_file:
for record in json_records:
json_file.write(json.dumps(record) + '\n')

# Zip the JSON file for submission
zip_file_path = '/kaggle/working/res.zip'
with zipfile.ZipFile(zip_file_path, 'w') as zipf:
zipf.write(json_file_path, arcname='submission.json')
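
HateSpeechDatasetTestOnly is used above but its definition is collapsed. A minimal sketch of a label-free Dataset compatible with trainer.predict; the exact fields returned are an assumption:

import torch
from torch.utils.data import Dataset

class HateSpeechDatasetTestOnly(Dataset):
    def __init__(self, encodings):
        # Wraps tokenizer output (input_ids, attention_mask, ...) with no labels.
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings["input_ids"])
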
11 changes: 5 additions & 6 deletions models/m4_chipsal2.py
@@ -14,14 +14,13 @@
tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert')
model = AutoModelForSequenceClassification.from_pretrained('ai4bharat/indic-bert')

dataset_file_path = '/home/anmol/codes/CHIPSAL/final_augmented_dataset_duplicated.csv'
dataset_file_path = '/codes/CHIPSAL/final_augmented_dataset_duplicated.csv'
df = pd.read_csv(dataset_file_path)
print(df)

print(df['label'].value_counts())

def preprocess_text(text):
# Remove URLs
text = re.sub(r'http\S+', '', text)
# Remove hashtags
text = re.sub(r'#\S+', '', text)
@@ -70,12 +69,12 @@ def __len__(self):
learning_rate=2e-5,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
num_train_epochs=10, # Set number of epochs to 10
num_train_epochs=10,
weight_decay=0.01,
logging_dir=log_dir,
logging_steps=10,
save_strategy="epoch", # Save the model at the end of every epoch
save_total_limit=10, # Limit the number of saved models
save_strategy="epoch",
save_total_limit=10,
save_safetensors=False
)

@@ -139,7 +138,7 @@ def __len__(self):
pred_labels = np.argmax(predictions.predictions, axis=1)

submission_df = pd.DataFrame({
'index': test_df['index'], # Assuming 'index' column exists in the test data
'index': test_df['index'],
'prediction': pred_labels.tolist()
})

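
preprocess_text is only partially visible above (URL and hashtag removal). A sketch of a complete cleaner; the mention-stripping and whitespace steps are assumptions beyond what the diff shows:

import re

def preprocess_text(text):
    text = re.sub(r'http\S+', '', text)  # remove URLs (as in the diff)
    text = re.sub(r'#\S+', '', text)     # remove hashtags (as in the diff)
    text = re.sub(r'@\S+', '', text)     # remove user mentions (assumed)
    text = re.sub(r'\s+', ' ', text)     # collapse repeated whitespace (assumed)
    return text.strip()

print(preprocess_text("@user यो ट्वीट हो #tag https://example.com"))  # -> "यो ट्वीट हो"
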
12 changes: 5 additions & 7 deletions models/m5_chipsal.py
@@ -38,7 +38,6 @@ def __getitem__(self, idx):
def __len__(self):
return len(self.texts)

# Prepare train and validation datasets
train_texts = train_data['tweet'].tolist()
train_labels = train_data['label'].tolist()
val_texts = val_data['tweet'].tolist()
@@ -51,18 +50,18 @@ class LSTM_CNN_IndicBERT(nn.Module):
def __init__(self, hidden_dim, num_classes):
super(LSTM_CNN_IndicBERT, self).__init__()
self.indic_bert_model = indic_bert_model
self.lstm = nn.LSTM(768, hidden_dim, batch_first=True) # IndicBERT output is 768 dimensions
self.lstm = nn.LSTM(768, hidden_dim, batch_first=True)
self.conv1 = nn.Conv1d(hidden_dim, 128, kernel_size=3, padding=1)
self.fc = nn.Linear(128, num_classes)

def forward(self, input_ids, attention_mask):
with torch.no_grad(): # Freeze IndicBERT during LSTM-CNN processing
with torch.no_grad():
bert_outputs = self.indic_bert_model(input_ids=input_ids, attention_mask=attention_mask)
last_hidden_state = bert_outputs.last_hidden_state

lstm_out, _ = self.lstm(last_hidden_state)
cnn_out = torch.relu(self.conv1(lstm_out.permute(0, 2, 1))) # Apply Conv1d
pooled = torch.mean(cnn_out, dim=-1) # Global average pooling
cnn_out = torch.relu(self.conv1(lstm_out.permute(0, 2, 1)))
pooled = torch.mean(cnn_out, dim=-1)
output = self.fc(pooled)
return output

@@ -147,8 +146,7 @@ def evaluate_model(model, loader):
json_file_path = f'{output_dir_name}/submission_{timestamp}.json'
submission_df.to_json(json_file_path, orient='records', lines=True)

# Zip the JSON file for subm
# ission

zip_file_path = f'{output_dir_name}_{timestamp}.zip'
with zipfile.ZipFile(zip_file_path, 'w') as zipf:
zipf.write(json_file_path, arcname=f'submission_{timestamp}.json')
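
To make the tensor shapes in LSTM_CNN_IndicBERT's forward pass concrete, a small walk-through with a dummy batch (hidden_dim=256 and the sequence length are arbitrary assumptions):

import torch
import torch.nn as nn

batch, seq_len, hidden_dim, num_classes = 4, 128, 256, 2
last_hidden_state = torch.randn(batch, seq_len, 768)      # stand-in for IndicBERT output

lstm = nn.LSTM(768, hidden_dim, batch_first=True)
conv1 = nn.Conv1d(hidden_dim, 128, kernel_size=3, padding=1)
fc = nn.Linear(128, num_classes)

lstm_out, _ = lstm(last_hidden_state)                      # (batch, seq_len, hidden_dim)
cnn_out = torch.relu(conv1(lstm_out.permute(0, 2, 1)))     # (batch, 128, seq_len)
pooled = torch.mean(cnn_out, dim=-1)                       # (batch, 128), global average pooling
logits = fc(pooled)                                        # (batch, num_classes)
print(logits.shape)                                        # torch.Size([4, 2])
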
8 changes: 4 additions & 4 deletions models/m6_chipsal7.py
@@ -16,8 +16,8 @@
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
xlmr_model = AutoModel.from_pretrained("xlm-roberta-base").to("cuda" if torch.cuda.is_available() else "cpu")

train_df = pd.read_csv('/home/anmol/codes/CHIPSAL/dataset/train.csv')
test_df = pd.read_csv('/home/anmol/codes/CHIPSAL/dataset/test.csv')
train_df = pd.read_csv('/codes/CHIPSAL/dataset/train.csv')
test_df = pd.read_csv('/codes/CHIPSAL/dataset/test.csv')

train_df['tweet'] = train_df['tweet'].astype(str)
test_df['tweet'] = test_df['tweet'].astype(str)
@@ -34,7 +34,7 @@ def get_xlm_roberta_embeddings(text):
train_embeddings = np.array([get_xlm_roberta_embeddings(text) for text in tqdm(train_df['tweet'], desc="Train embeddings")])
test_embeddings = np.array([get_xlm_roberta_embeddings(text) for text in tqdm(test_df['tweet'], desc="Test embeddings")])

train_labels = train_df['label'].values # Assuming 1 is hate and 0 is non-hate
train_labels = train_df['label'].values

scaler = StandardScaler()
train_embeddings = scaler.fit_transform(train_embeddings)
@@ -47,7 +47,7 @@ def get_xlm_roberta_embeddings(text):
pred_labels = log_reg_clf.predict(test_embeddings)

submission_df = pd.DataFrame({
'index': test_df['index'], # Using 'index' from the test data
'index': test_df['index'],
'prediction': pred_labels.tolist()
})

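
The logistic-regression fit sits in the collapsed region above. A self-contained sketch of scaling the XLM-R sentence embeddings and fitting the classifier; the random stand-in data and max_iter value are assumptions:

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

train_embeddings = np.random.randn(100, 768)     # stand-in for XLM-R embeddings
train_labels = np.random.randint(0, 2, size=100)
test_embeddings = np.random.randn(20, 768)

scaler = StandardScaler()
train_embeddings = scaler.fit_transform(train_embeddings)
test_embeddings = scaler.transform(test_embeddings)  # reuse the training statistics

log_reg_clf = LogisticRegression(max_iter=1000)
log_reg_clf.fit(train_embeddings, train_labels)
pred_labels = log_reg_clf.predict(test_embeddings)
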
4 changes: 2 additions & 2 deletions models/m7_chipsal.py
@@ -37,7 +37,7 @@ def __len__(self):

# Prepare train dataset
train_texts = train_df['tweet'].tolist()
train_labels = train_df['label'].tolist() # Assuming 1 is hate, 0 is non-hate
train_labels = train_df['label'].tolist()
train_dataset = HateSpeechDataset(train_texts, train_labels)

training_args = TrainingArguments(
@@ -83,7 +83,7 @@ def compute_metrics(pred):
pred_labels = predictions.predictions.argmax(-1)

submission_df = pd.DataFrame({
'index': test_df['index'], # Using 'index' from the test data
'index': test_df['index'],
'prediction': pred_labels.tolist()
})

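
The TrainingArguments call above is collapsed. A self-contained sketch of a configuration consistent with the surrounding Trainer code — every value below is an assumption, not the committed settings:

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="outputs",
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=1,
)
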
10 changes: 5 additions & 5 deletions models/m8_chipsal12.py
@@ -15,14 +15,14 @@
fasttext_hi = KeyedVectors.load_word2vec_format("cc.hi.300.vec.gz", binary=False)
fasttext_ne = KeyedVectors.load_word2vec_format("cc.ne.300.vec.gz", binary=False)

embedding_dim = fasttext_hi.vector_size # Assuming both have the same embedding dimension
embedding_dim = fasttext_hi.vector_size

print("Loading train and test datasets...")


print("Loading train and test datasets...")
train_df = pd.read_csv('dataset/train.csv') # Ensure 'train.csv' has 'tweet' and 'label'
test_df = pd.read_csv('dataset/test.csv') # Ensure 'test.csv' has 'tweet' and 'index' (no label for test set)
train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')

train_df['tweet'] = train_df['tweet'].astype(str).fillna('')
test_df['tweet'] = test_df['tweet'].astype(str).fillna('')
@@ -57,11 +57,11 @@ def get_fasttext_embeddings(self, text):
else:
emb_ne = np.zeros(embedding_dim)

# Average the embeddings if the word exists in both, otherwise use what's available

word_embedding = (emb_hi + emb_ne) / 2 if np.any(emb_hi) and np.any(emb_ne) else emb_hi if np.any(emb_hi) else emb_ne
word_embeddings.append(word_embedding)

# Return average word embeddings for the text

return np.mean(word_embeddings, axis=0) if word_embeddings else np.zeros(embedding_dim)

def __getitem__(self, idx):
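
get_fasttext_embeddings above interleaves two fastText vocabularies; rewritten here as a standalone sketch with tiny in-memory KeyedVectors so the fallback-and-average rule is easy to follow (the toy words and dimension are assumptions):

import numpy as np
from gensim.models import KeyedVectors

embedding_dim = 4  # toy dimension; the real cc.hi / cc.ne vectors are 300-d

fasttext_hi = KeyedVectors(vector_size=embedding_dim)
fasttext_hi.add_vectors(["नमस्ते"], [np.ones(embedding_dim)])
fasttext_ne = KeyedVectors(vector_size=embedding_dim)
fasttext_ne.add_vectors(["नमस्ते", "साथी"], [np.full(embedding_dim, 3.0), np.full(embedding_dim, 2.0)])

def get_fasttext_embeddings(text):
    word_embeddings = []
    for word in text.split():
        emb_hi = fasttext_hi[word] if word in fasttext_hi else np.zeros(embedding_dim)
        emb_ne = fasttext_ne[word] if word in fasttext_ne else np.zeros(embedding_dim)
        # Average when the word exists in both vocabularies; otherwise fall
        # back to whichever embedding is available, else a zero vector.
        if np.any(emb_hi) and np.any(emb_ne):
            word_embedding = (emb_hi + emb_ne) / 2
        else:
            word_embedding = emb_hi if np.any(emb_hi) else emb_ne
        word_embeddings.append(word_embedding)
    return np.mean(word_embeddings, axis=0) if word_embeddings else np.zeros(embedding_dim)

print(get_fasttext_embeddings("नमस्ते साथी"))  # -> [2. 2. 2. 2.]
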
