
Commit

fixed: directory structure
Anmol2059 committed Dec 13, 2024
1 parent d6cfdfd commit f2bfe84
Showing 9 changed files with 35 additions and 229 deletions.
3 changes: 1 addition & 2 deletions models/m1_chipsal.py
@@ -61,8 +61,7 @@ def __len__(self):
weight_decay=0.01,
logging_dir=f"{output_dir_name}/logs",
load_best_model_at_end=True,
save_total_limit=1 # Keeps only the latest checkpoint to save disk space
)
save_total_limit=1

def compute_metrics(pred):
labels = pred.label_ids
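
The body of compute_metrics is collapsed in the hunk above. For context, a minimal sketch of such a metric function for binary hate/non-hate classification with the Hugging Face Trainer — the exact metrics returned here are an assumption, not the committed code:

import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    # Trainer passes an EvalPrediction carrying label_ids and raw logits.
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
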
12 changes: 6 additions & 6 deletions models/m2_chipsal.py
@@ -14,8 +14,8 @@
tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
muril_model = AutoModel.from_pretrained("google/muril-base-cased").to("cuda" if torch.cuda.is_available() else "cpu")

train_df = pd.read_csv('/home/anmol/codes/CHIPSAL/dataset/train.csv')
test_df = pd.read_csv('/home/anmol/codes/CHIPSAL/dataset/test.csv')
train_df = pd.read_csv('/codes/CHIPSAL/dataset/train.csv')
test_df = pd.read_csv('/codes/CHIPSAL/dataset/test.csv')

train_df['tweet'] = train_df['tweet'].astype(str)
test_df['tweet'] = test_df['tweet'].astype(str)
@@ -44,9 +44,9 @@ def get_muril_embeddings(text):
tabnet_clf.fit(
train_embeddings,
train_labels,
max_epochs=150, # Increase number of epochs
patience=10, # Early stopping after 10 epochs without improvement
batch_size=256, # Adjust batch size for your GPU memory
max_epochs=150,
patience=10,
batch_size=256,
drop_last=False
)

@@ -55,7 +55,7 @@ def get_muril_embeddings(text):
pred_labels = np.where(pred_labels == 1, 1, 0)

submission_df = pd.DataFrame({
'index': test_df['index'], # Using 'index' from the test data
'index': test_df['index'],
'prediction': pred_labels.tolist()
})

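
get_muril_embeddings is referenced in the hunk headers above but its body is collapsed. A self-contained sketch of such a helper, assuming mean pooling over MuRIL's last hidden state (the pooling strategy and max_length are assumptions):

import torch
from transformers import AutoTokenizer, AutoModel

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
muril_model = AutoModel.from_pretrained("google/muril-base-cased").to(device)

def get_muril_embeddings(text):
    # Encode one tweet and mean-pool the last hidden state into a single vector.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = muril_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

Vectors produced this way are what the TabNet classifier above is fit on.
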
36 changes: 7 additions & 29 deletions models/m3_chipsal.py
@@ -1,29 +1,15 @@
# -*- coding: utf-8 -*-
"""HateSpeech_2

Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/hatespeech-2-5962ccfa-fa95-439c-aff3-2cc51df48d36.ipynb%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com/20241106/auto/storage/goog4_request%26X-Goog-Date%3D20241106T170641Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D40face555c0fc1faa66142e80157900cf2e64e6e1dce102a87b6d7c730a256ca23407e52a59ea201d90df69a71282052135a500e86b48dc61b7af705eb7f8f70cd2e4ea86571028a32d3bb2e1096535710af455eed992abb3091097adebbb32c8bbc8c8ed91cdbddd2fdcf65b6a6923d924838a27ea9c435a653b7698eb7d1529f030d2f59fb211337ec86a3fd9c04374ce2c5e71ef5ee2027bbbba9f8b07d2cc22a720fc0b29cb34527c19e7149c69f70dd447f68e8bb25854e5ddf135a13d9c49634926ceb65c0dbd4d76c97b145e3439802f40310276e15a2aa49f4bbf972b1bbebfbbe5ffe26ed9e4def71bf1268e238b877750810269b187d2e570a9185
"""

# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
kagglehub.login()

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.


nadspoudel_hs_dataset2_path = kagglehub.dataset_download('nadspoudel/hs-dataset2')
nadspoudel_test_dataset_path = kagglehub.dataset_download('nadspoudel/test-dataset')

print('Data source import complete.')

!pip install -q torch transformers huggingface_hub datasets
# !pip install -q torch transformers huggingface_hub datasets

from transformers import AutoModelForSequenceClassification, AutoTokenizer,TrainingArguments,Trainer
import torch
@@ -93,7 +79,7 @@ def __getitem__(self, idx):
def __len__(self):
return len(self.labels)

# Create train and test datasets

train_dataset = HateSpeechDataset(train_encodings, train_labels)
test_dataset = HateSpeechDataset(test_encodings, test_labels)

@@ -107,7 +93,7 @@ def __len__(self):
weight_decay=0.01,
logging_dir='/kaggle/working/logs',
logging_steps=10,
save_safetensors=False # Disable safetensors saving
save_safetensors=False

)

@@ -127,14 +113,12 @@ def __len__(self):
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# Get predictions and labels using trainer.predict()

pred_output = trainer.predict(test_dataset)

# Extract predictions and labels
predictions = pred_output.predictions.argmax(axis=1) # Assuming it's a classification task
predictions = pred_output.predictions.argmax(axis=1)
labels = pred_output.label_ids

# Calculate accuracy and F1 score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

accuracy = accuracy_score(labels, predictions)
@@ -150,7 +134,6 @@ def __len__(self):

test_df['tweet'] = test_df['tweet'].apply(preprocess_text)

# Tokenize the cleaned test tweets
test_encodings = tokenizer(
test_df['tweet'].tolist(),
padding=True,
@@ -172,28 +155,23 @@ def __len__(self):

test_dataset = HateSpeechDatasetTestOnly(test_encodings)

# Make predictions on the test set
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

# Create a DataFrame for predictions
submission_df = pd.DataFrame({
'index': test_df['index'], # Assuming 'index' column exists in the test data
'index': test_df['index'],
'prediction': pred_labels.tolist()
})

# Sort the DataFrame by index to meet submission requirements
submission_df = submission_df.sort_values(by='index').reset_index(drop=True)

json_records = submission_df.apply(lambda row: {"index": int(row['index']), "prediction": int(row['prediction'])}, axis=1)

# Save to a JSON file
json_file_path = '/kaggle/working/submission.json'
with open(json_file_path, 'w') as json_file:
for record in json_records:
json_file.write(json.dumps(record) + '\n')

# Zip the JSON file for submission
zip_file_path = '/kaggle/working/res.zip'
with zipfile.ZipFile(zip_file_path, 'w') as zipf:
zipf.write(json_file_path, arcname='submission.json')
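
HateSpeechDatasetTestOnly is used above but its definition is collapsed. A minimal sketch of a label-free Dataset compatible with trainer.predict; the exact fields returned are an assumption:

import torch
from torch.utils.data import Dataset

class HateSpeechDatasetTestOnly(Dataset):
    def __init__(self, encodings):
        # Wraps tokenizer output (input_ids, attention_mask, ...) with no labels.
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings["input_ids"])
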
11 changes: 5 additions & 6 deletions models/m4_chipsal2.py
@@ -14,14 +14,13 @@
tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert')
model = AutoModelForSequenceClassification.from_pretrained('ai4bharat/indic-bert')

dataset_file_path = '/home/anmol/codes/CHIPSAL/final_augmented_dataset_duplicated.csv'
dataset_file_path = '/codes/CHIPSAL/final_augmented_dataset_duplicated.csv'
df = pd.read_csv(dataset_file_path)
print(df)

print(df['label'].value_counts())

def preprocess_text(text):
# Remove URLs
text = re.sub(r'http\S+', '', text)
# Remove hashtags
text = re.sub(r'#\S+', '', text)
@@ -70,12 +69,12 @@ def __len__(self):
learning_rate=2e-5,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
num_train_epochs=10, # Set number of epochs to 10
num_train_epochs=10,
weight_decay=0.01,
logging_dir=log_dir,
logging_steps=10,
save_strategy="epoch", # Save the model at the end of every epoch
save_total_limit=10, # Limit the number of saved models
save_strategy="epoch",
save_total_limit=10,
save_safetensors=False
)

@@ -139,7 +138,7 @@ def __len__(self):
pred_labels = np.argmax(predictions.predictions, axis=1)

submission_df = pd.DataFrame({
'index': test_df['index'], # Assuming 'index' column exists in the test data
'index': test_df['index'],
'prediction': pred_labels.tolist()
})

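
preprocess_text is only partially visible above (URL and hashtag removal). A sketch of a complete cleaner; the mention-stripping and whitespace steps are assumptions beyond what the diff shows:

import re

def preprocess_text(text):
    text = re.sub(r'http\S+', '', text)  # remove URLs (as in the diff)
    text = re.sub(r'#\S+', '', text)     # remove hashtags (as in the diff)
    text = re.sub(r'@\S+', '', text)     # remove user mentions (assumed)
    text = re.sub(r'\s+', ' ', text)     # collapse repeated whitespace (assumed)
    return text.strip()

print(preprocess_text("@user यो ट्वीट हो #tag https://example.com"))  # -> "यो ट्वीट हो"
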
12 changes: 5 additions & 7 deletions models/m5_chipsal.py
@@ -38,7 +38,6 @@ def __getitem__(self, idx):
def __len__(self):
return len(self.texts)

# Prepare train and validation datasets
train_texts = train_data['tweet'].tolist()
train_labels = train_data['label'].tolist()
val_texts = val_data['tweet'].tolist()
@@ -51,18 +50,18 @@ class LSTM_CNN_IndicBERT(nn.Module):
def __init__(self, hidden_dim, num_classes):
super(LSTM_CNN_IndicBERT, self).__init__()
self.indic_bert_model = indic_bert_model
self.lstm = nn.LSTM(768, hidden_dim, batch_first=True) # IndicBERT output is 768 dimensions
self.lstm = nn.LSTM(768, hidden_dim, batch_first=True)
self.conv1 = nn.Conv1d(hidden_dim, 128, kernel_size=3, padding=1)
self.fc = nn.Linear(128, num_classes)

def forward(self, input_ids, attention_mask):
with torch.no_grad(): # Freeze IndicBERT during LSTM-CNN processing
with torch.no_grad():
bert_outputs = self.indic_bert_model(input_ids=input_ids, attention_mask=attention_mask)
last_hidden_state = bert_outputs.last_hidden_state

lstm_out, _ = self.lstm(last_hidden_state)
cnn_out = torch.relu(self.conv1(lstm_out.permute(0, 2, 1))) # Apply Conv1d
pooled = torch.mean(cnn_out, dim=-1) # Global average pooling
cnn_out = torch.relu(self.conv1(lstm_out.permute(0, 2, 1)))
pooled = torch.mean(cnn_out, dim=-1)
output = self.fc(pooled)
return output

@@ -147,8 +146,7 @@ def evaluate_model(model, loader):
json_file_path = f'{output_dir_name}/submission_{timestamp}.json'
submission_df.to_json(json_file_path, orient='records', lines=True)

# Zip the JSON file for subm
# ission

zip_file_path = f'{output_dir_name}_{timestamp}.zip'
with zipfile.ZipFile(zip_file_path, 'w') as zipf:
zipf.write(json_file_path, arcname=f'submission_{timestamp}.json')
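
To make the tensor shapes in LSTM_CNN_IndicBERT's forward pass concrete, a small walk-through with a dummy batch (hidden_dim=256 and the sequence length are arbitrary assumptions):

import torch
import torch.nn as nn

batch, seq_len, hidden_dim, num_classes = 4, 128, 256, 2
last_hidden_state = torch.randn(batch, seq_len, 768)      # stand-in for IndicBERT output

lstm = nn.LSTM(768, hidden_dim, batch_first=True)
conv1 = nn.Conv1d(hidden_dim, 128, kernel_size=3, padding=1)
fc = nn.Linear(128, num_classes)

lstm_out, _ = lstm(last_hidden_state)                      # (batch, seq_len, hidden_dim)
cnn_out = torch.relu(conv1(lstm_out.permute(0, 2, 1)))     # (batch, 128, seq_len)
pooled = torch.mean(cnn_out, dim=-1)                       # (batch, 128), global average pooling
logits = fc(pooled)                                        # (batch, num_classes)
print(logits.shape)                                        # torch.Size([4, 2])
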
8 changes: 4 additions & 4 deletions models/m6_chipsal7.py
@@ -16,8 +16,8 @@
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
xlmr_model = AutoModel.from_pretrained("xlm-roberta-base").to("cuda" if torch.cuda.is_available() else "cpu")

train_df = pd.read_csv('/home/anmol/codes/CHIPSAL/dataset/train.csv')
test_df = pd.read_csv('/home/anmol/codes/CHIPSAL/dataset/test.csv')
train_df = pd.read_csv('/codes/CHIPSAL/dataset/train.csv')
test_df = pd.read_csv('/codes/CHIPSAL/dataset/test.csv')

train_df['tweet'] = train_df['tweet'].astype(str)
test_df['tweet'] = test_df['tweet'].astype(str)
@@ -34,7 +34,7 @@ def get_xlm_roberta_embeddings(text):
train_embeddings = np.array([get_xlm_roberta_embeddings(text) for text in tqdm(train_df['tweet'], desc="Train embeddings")])
test_embeddings = np.array([get_xlm_roberta_embeddings(text) for text in tqdm(test_df['tweet'], desc="Test embeddings")])

train_labels = train_df['label'].values # Assuming 1 is hate and 0 is non-hate
train_labels = train_df['label'].values

scaler = StandardScaler()
train_embeddings = scaler.fit_transform(train_embeddings)
@@ -47,7 +47,7 @@ def get_xlm_roberta_embeddings(text):
pred_labels = log_reg_clf.predict(test_embeddings)

submission_df = pd.DataFrame({
'index': test_df['index'], # Using 'index' from the test data
'index': test_df['index'],
'prediction': pred_labels.tolist()
})

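
The logistic-regression fit sits in the collapsed region above. A self-contained sketch of scaling the XLM-R sentence embeddings and fitting the classifier; the random stand-in data and max_iter value are assumptions:

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

train_embeddings = np.random.randn(100, 768)     # stand-in for XLM-R embeddings
train_labels = np.random.randint(0, 2, size=100)
test_embeddings = np.random.randn(20, 768)

scaler = StandardScaler()
train_embeddings = scaler.fit_transform(train_embeddings)
test_embeddings = scaler.transform(test_embeddings)  # reuse the training statistics

log_reg_clf = LogisticRegression(max_iter=1000)
log_reg_clf.fit(train_embeddings, train_labels)
pred_labels = log_reg_clf.predict(test_embeddings)
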
4 changes: 2 additions & 2 deletions models/m7_chipsal.py
@@ -37,7 +37,7 @@ def __len__(self):

# Prepare train dataset
train_texts = train_df['tweet'].tolist()
train_labels = train_df['label'].tolist() # Assuming 1 is hate, 0 is non-hate
train_labels = train_df['label'].tolist()
train_dataset = HateSpeechDataset(train_texts, train_labels)

training_args = TrainingArguments(
@@ -83,7 +83,7 @@ def compute_metrics(pred):
pred_labels = predictions.predictions.argmax(-1)

submission_df = pd.DataFrame({
'index': test_df['index'], # Using 'index' from the test data
'index': test_df['index'],
'prediction': pred_labels.tolist()
})

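
The TrainingArguments call above is collapsed. A self-contained sketch of a configuration consistent with the surrounding Trainer code — every value below is an assumption, not the committed settings:

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="outputs",
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=1,
)
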
10 changes: 5 additions & 5 deletions models/m8_chipsal12.py
@@ -15,14 +15,14 @@
fasttext_hi = KeyedVectors.load_word2vec_format("cc.hi.300.vec.gz", binary=False)
fasttext_ne = KeyedVectors.load_word2vec_format("cc.ne.300.vec.gz", binary=False)

embedding_dim = fasttext_hi.vector_size # Assuming both have the same embedding dimension
embedding_dim = fasttext_hi.vector_size

print("Loading train and test datasets...")


print("Loading train and test datasets...")
train_df = pd.read_csv('dataset/train.csv') # Ensure 'train.csv' has 'tweet' and 'label'
test_df = pd.read_csv('dataset/test.csv') # Ensure 'test.csv' has 'tweet' and 'index' (no label for test set)
train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')

train_df['tweet'] = train_df['tweet'].astype(str).fillna('')
test_df['tweet'] = test_df['tweet'].astype(str).fillna('')
@@ -57,11 +57,11 @@ def get_fasttext_embeddings(self, text):
else:
emb_ne = np.zeros(embedding_dim)

# Average the embeddings if the word exists in both, otherwise use what's available

word_embedding = (emb_hi + emb_ne) / 2 if np.any(emb_hi) and np.any(emb_ne) else emb_hi if np.any(emb_hi) else emb_ne
word_embeddings.append(word_embedding)

# Return average word embeddings for the text

return np.mean(word_embeddings, axis=0) if word_embeddings else np.zeros(embedding_dim)

def __getitem__(self, idx):
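
get_fasttext_embeddings above interleaves two fastText vocabularies; rewritten here as a standalone sketch with tiny in-memory KeyedVectors so the fallback-and-average rule is easy to follow (the toy words and dimension are assumptions):

import numpy as np
from gensim.models import KeyedVectors

embedding_dim = 4  # toy dimension; the real cc.hi / cc.ne vectors are 300-d

fasttext_hi = KeyedVectors(vector_size=embedding_dim)
fasttext_hi.add_vectors(["नमस्ते"], [np.ones(embedding_dim)])
fasttext_ne = KeyedVectors(vector_size=embedding_dim)
fasttext_ne.add_vectors(["नमस्ते", "साथी"], [np.full(embedding_dim, 3.0), np.full(embedding_dim, 2.0)])

def get_fasttext_embeddings(text):
    word_embeddings = []
    for word in text.split():
        emb_hi = fasttext_hi[word] if word in fasttext_hi else np.zeros(embedding_dim)
        emb_ne = fasttext_ne[word] if word in fasttext_ne else np.zeros(embedding_dim)
        # Average when the word exists in both vocabularies; otherwise fall
        # back to whichever embedding is available, else a zero vector.
        if np.any(emb_hi) and np.any(emb_ne):
            word_embedding = (emb_hi + emb_ne) / 2
        else:
            word_embedding = emb_hi if np.any(emb_hi) else emb_ne
        word_embeddings.append(word_embedding)
    return np.mean(word_embeddings, axis=0) if word_embeddings else np.zeros(embedding_dim)

print(get_fasttext_embeddings("नमस्ते साथी"))  # -> [2. 2. 2. 2.]
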
