[Evals] Refactor tasks, add linting and CI #47

Merged: 49 commits, Feb 1, 2025

Commits
aa47191
refactoring init
SumanthRH Jan 23, 2025
ee8d0f4
add a bunch of stuff
SumanthRH Jan 24, 2025
0cf3548
add a bunch of stuff
SumanthRH Jan 24, 2025
f48b5dc
check
SumanthRH Jan 24, 2025
3e624af
check
SumanthRH Jan 24, 2025
859f023
check
SumanthRH Jan 24, 2025
d877bee
more refactoring
SumanthRH Jan 24, 2025
faaf293
x
SumanthRH Jan 24, 2025
c18ad7b
extra large commit
SumanthRH Jan 24, 2025
b7d9532
x
SumanthRH Jan 24, 2025
8b9c67e
minor linting changes
SumanthRH Jan 24, 2025
c767708
minor
SumanthRH Jan 24, 2025
995f1a5
more more more
SumanthRH Jan 24, 2025
7bafae1
mid commit
SumanthRH Jan 24, 2025
829cb4c
x
SumanthRH Jan 25, 2025
484df2d
Merge remote-tracking branch 'upstream/main' into refactor-evals
SumanthRH Jan 25, 2025
3188aeb
add answer_key
SumanthRH Jan 25, 2025
612ecaf
x
SumanthRH Jan 27, 2025
77b8ba3
x
SumanthRH Jan 27, 2025
52a4ce7
fixing pre-commit
SumanthRH Jan 27, 2025
d21d68e
x
SumanthRH Jan 27, 2025
2069df7
move some stuff; init tests; init package skyevals
SumanthRH Jan 27, 2025
05b1653
Merge branch 'refactor-evals' of github.com:sumanthrh/skythought into…
SumanthRH Jan 27, 2025
55b670c
rm llama factory change
SumanthRH Jan 27, 2025
f2d88e9
merge issues
SumanthRH Jan 27, 2025
a9a9380
more linting
SumanthRH Jan 27, 2025
d3dde5d
x
SumanthRH Jan 27, 2025
9d64fd1
more comments
SumanthRH Jan 27, 2025
d55d2a8
x
SumanthRH Jan 28, 2025
840006f
test workflows
SumanthRH Jan 28, 2025
4cdeab0
x
SumanthRH Jan 28, 2025
8d564a3
x
SumanthRH Jan 28, 2025
fc1087e
x
SumanthRH Jan 28, 2025
6e2e979
it's time to fight the CI
SumanthRH Jan 28, 2025
7449117
I might have won the fight:
SumanthRH Jan 28, 2025
04ead2a
CI please
SumanthRH Jan 28, 2025
84c9617
set up permissions
SumanthRH Jan 28, 2025
e032b47
test ci setup
SumanthRH Jan 28, 2025
7e99d60
x
SumanthRH Jan 28, 2025
3f5ff02
x
SumanthRH Jan 28, 2025
79d12a2
update to two workflows
SumanthRH Jan 28, 2025
c8a8d63
update to later vllm; needed for some tokenizer_revision fixes
SumanthRH Jan 28, 2025
aa87124
x
SumanthRH Jan 28, 2025
eab138a
x
SumanthRH Jan 28, 2025
07c21f9
small update
SumanthRH Jan 31, 2025
3d6942f
reworking args
SumanthRH Feb 1, 2025
c2944fe
x
SumanthRH Feb 1, 2025
8a39701
x
SumanthRH Feb 1, 2025
503ea62
x
SumanthRH Feb 1, 2025
10 changes: 10 additions & 0 deletions skythought/tools/.githooks/pre-commit
Collaborator:

We should put this linter stuff at the top level of the repo. Did you have any reason for not putting it there?

Collaborator:

If you don't want to apply these to the train sub-folder, let's just exclude it from the pre-commit / format runs.

Author:

I've added pre-commit hooks for the tools/ folder only, because I'm not sure we're at a state where we can clean up the training-related code in train/. I wanted to separate them out so that those working on training can proceed as usual for now.

LMK if this doesn't make sense. The main reason I kept it separate is that the train/ folder is a fork of the Llamafactory repo, while skythought/tools looks like it will be a package in itself, focused on evals.

Author (SumanthRH), Jan 27, 2025:

Ok, so I moved this to the top level of the repo. I also added a setup.py which installs our evaluation package at skythought/skythought_evals (renamed from tools) as skythought_evals.

@@ -0,0 +1,10 @@
set -e

# Get tools directory path relative to git root
TOOLS_DIR=$(git rev-parse --show-toplevel)/skythought/tools
# Only run pre-commit if changes are in tools/
# Run pre-commit from tools/ directory to use linting rules in this directory
if git diff --cached --name-only | grep "^skythought/tools/"; then
cd $TOOLS_DIR;
pre-commit run --files $(git diff --cached --name-only | grep "^skythought/tools/") --config .pre-commit-config.yaml
fi
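
(Note: since this hook lives in a custom .githooks/ directory rather than .git/hooks/, it presumably only runs once Git is pointed at that directory, e.g. via git config core.hooksPath skythought/tools/.githooks, or once the script is copied into .git/hooks/. That setup step isn't shown in this diff.)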
12 changes: 12 additions & 0 deletions skythought/tools/.pre-commit-config.yaml
@@ -0,0 +1,12 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.9
hooks:
- id: ruff
args: [ --fix, --exit-non-zero-on-fix ]

# Black needs to be run after ruff with --fix
- repo: https://github.com/psf/black
rev: 23.3.0
hooks:
- id: black
23 changes: 13 additions & 10 deletions skythought/tools/combine_data.py
Collaborator:

Maybe not in this PR, but for a better-structured codebase we should take these scripts and put them in the following refactored structure:

skythought/... # reusable python modules go in here. 
project_scripts/
    skythought-t1/
        data_preparation/
            combine_data.py
        train_yamls/
            ...
    skythought-t1-flash/
        ...
tests/
    skythought/... # internal unittests

Author (SumanthRH), Jan 27, 2025:

Yeah, agreed. Let's postpone this higher-level folder re-org for later.

I've now moved the linting and pre-commit hooks to the top-level repo and ignored the train/ folder for now. I've added basic tests in tests/tools for skythought/tools. Also, since we need a package name to make use of the module in tests, I wrote a small setup.py that sets up the package skythought_evals; this is just an alias for skythought.tools.
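
For reference, a minimal sketch of what such a setup.py could look like (hypothetical; the actual file in the PR may differ):

from setuptools import setup

setup(
    name="skythought_evals",
    version="0.0.1",
    # Assumed layout: expose the skythought/tools directory under the alias
    # package name skythought_evals so tests can `import skythought_evals`.
    package_dir={"skythought_evals": "skythought/tools"},
    packages=["skythought_evals"],
)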

@@ -1,5 +1,6 @@
import json
import random

from util.prompts import system_prompt

still2_jsonl_file = "../../data/public_long_form_thought_data_5k.jsonl"
@@ -25,14 +26,11 @@
# Create the conversation format
conversations = [
{"from": "user", "value": question},
{"from": "assistant", "value": combined_text}
{"from": "assistant", "value": combined_text},
]

# Prepare the final structure
cur_data = {
"system": system_prompt,
"conversations": conversations
}
cur_data = {"system": system_prompt, "conversations": conversations}
all_data.append(cur_data)
else:
code_num += 1
@@ -43,14 +41,19 @@
# print(code_data[0])

all_data.extend(code_data)
print(f"First item slice before shuffle: {all_data[0]['conversations'][-1]['value'][-50:-1]}")
print(
f"First item slice before shuffle: {all_data[0]['conversations'][-1]['value'][-50:-1]}"
)
random.shuffle(all_data)
print(f"First item slice after shuffle: {all_data[0]['conversations'][-1]['value'][-50:-1]}")
print(
f"First item slice after shuffle: {all_data[0]['conversations'][-1]['value'][-50:-1]}"
)
print(len(all_data))

# Save the converted data to the output file
with open(output_file, "w") as f:
json.dump(all_data, f, indent=4)

print(f"Conversion completed. The data has been saved to {output_file} with {len(all_data)} data.")

print(
f"Conversion completed. The data has been saved to {output_file} with {len(all_data)} data."
)
51 changes: 34 additions & 17 deletions skythought/tools/convert_format.py
@@ -1,23 +1,28 @@
import json
import argparse
from tqdm import tqdm
import json
import multiprocessing as mp
import openai
from itertools import cycle
import time
import os
import time
from itertools import cycle

import openai
from tqdm import tqdm

from util.prompts import convert_prompt, convert_prompt_example

global args


# Function to set the OpenAI API key
def set_openai_key(api_key):
openai.api_key = api_key


# GPT API processing function with retry logic
def process_content(content, api_key):
# Set the OpenAI key for this request
set_openai_key(api_key)

# GPT prompt
prompt = convert_prompt.format(example=convert_prompt_example, content=content)

@@ -28,44 +33,54 @@ def process_content(content, api_key):
response = openai.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": "You are a solution format convertor."},
{"role": "user", "content": prompt}
{
"role": "system",
"content": "You are a solution format convertor.",
},
{"role": "user", "content": prompt},
],
max_tokens=16384,
temperature=0.7
temperature=0.7,
)
return response.choices[0].message.content
except openai.RateLimitError:
retries -= 1
if retries == 0:
return "Error: Rate limit reached and retries exhausted."
print(f"Sleep for 5 seconds for API limit.")
print("Sleep for 5 seconds for API limit.")
time.sleep(5)
except Exception as e:
return f"Error processing content: {e}"


# Function for multiprocessing
def process_entry(entry, api_key_cycle):
key, values = entry
content = values["responses"]["0.7"]["content"]

# Get the next API key from the cycle
api_key = next(api_key_cycle)

processed = process_content(content, api_key)
values["responses"]["0.7"]["processed_content"] = processed

return key, values


# Wrapper function for multiprocessing
def process_entry_wrapper(args):
return process_entry(*args)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Process content and save results.")
parser.add_argument("--input_dir", type=str, help="Input directory containing JSON files.")
parser.add_argument("--keys", type=str, help="File containing OpenAI API keys (one per line).")

parser.add_argument(
"--input_dir", type=str, help="Input directory containing JSON files."
)
parser.add_argument(
"--keys", type=str, help="File containing OpenAI API keys (one per line)."
)

global args
args = parser.parse_args()

@@ -90,7 +105,9 @@ def process_entry_wrapper(args):
results = []
with mp.Pool(os.cpu_count()) as pool:
tasks = [(entry, api_key_cycle) for entry in data.items()]
for result in tqdm(pool.imap(process_entry_wrapper, tasks), total=len(data)):
for result in tqdm(
pool.imap(process_entry_wrapper, tasks), total=len(data)
):
results.append(result)

# Aggregate and write results in the main process
26 changes: 19 additions & 7 deletions skythought/tools/convert_to_data.py
@@ -1,11 +1,15 @@
import os
import json
import argparse
import json
import os

from util.prompts import system_prompt


def main():
parser = argparse.ArgumentParser(description="Convert JSON data for processing.")
parser.add_argument("--input_dir", type=str, help="Directory containing input JSON files.")
parser.add_argument(
"--input_dir", type=str, help="Directory containing input JSON files."
)
parser.add_argument("--output", type=str, help="Output JSON file.")
args = parser.parse_args()

@@ -24,27 +28,35 @@ def main():

for cur_temp, cur_temp_response in response_data.items():
# Only support 0.7 for this version
assert cur_temp == "0.7", "Only support a single temperature=0.7 now."
assert (
cur_temp == "0.7"
), "Only support a single temperature=0.7 now."
# Accept this data
if cur_temp_response["correctness"]:
# Create the conversation format
conversations = [
{"from": "user", "value": prompt},
{"from": "assistant", "value": cur_temp_response["processed_content"]}
{
"from": "assistant",
"value": cur_temp_response["processed_content"],
},
]

# Prepare the final structure
cur_data = {
"system": system_prompt,
"conversations": conversations
"conversations": conversations,
}
all_data.append(cur_data)

# Save the converted data to the output file
with open(args.output, "w") as f:
json.dump(all_data, f, indent=4)

print(f"Conversion completed. The data has been saved to {args.output} with {len(all_data)} data.")
print(
f"Conversion completed. The data has been saved to {args.output} with {len(all_data)} data."
)


if __name__ == "__main__":
main()
80 changes: 51 additions & 29 deletions skythought/tools/eval.py
@@ -1,32 +1,45 @@
import argparse
import json
import subprocess
import os
import json

# Define eval to split mapping
eval_to_split = {
"MATH500": "test",
"AIME": "train",
"GPQADiamond": "train",
"MMLU": "test",
"MMLUPro": "test",
"LiveCodeBench": "test",
"GSM8K": "test",
"ARC-C": "test",
"AMC23": "train",
}
from skythought.tools.tasks.task_util import get_tasks

module_dir = os.path.dirname(os.path.abspath(__file__))
TASK_NAMES_TO_YAML = get_tasks(os.path.join(module_dir, "tasks"))

def parse_arguments():
parser = argparse.ArgumentParser(description="Process model path, prompt format, and evals to run.")
parser = argparse.ArgumentParser(
description="Process model path, prompt format, and evals to run."
)
parser.add_argument("--model", required=True, type=str, help="Path to the model.")
parser.add_argument("--evals", required=True, type=str, help="Comma-separated list of evals to run (no spaces).")
parser.add_argument(
"--evals",
required=True,
type=str,
help="Comma-separated list of evals to run (no spaces).",
)
parser.add_argument("--tp", type=int, default=8, help="Tensor Parallelism Degree")
parser.add_argument("--filter-difficulty", action="store_true", help="Filter difficulty.")
parser.add_argument(
"--filter-difficulty", action="store_true", help="Filter difficulty."
)
parser.add_argument("--source", type=str, help="Source for the dataset.")
parser.add_argument("--output_file", required=True, type=str, help="Output file to write results to.")
parser.add_argument("--temperatures", type=float, nargs="+", default=[0], help="Temperature for sampling.")
parser.add_argument(
"--output_file",
required=True,
type=str,
help="Output file to write results to.",
)
parser.add_argument(
"--temperatures",
type=float,
nargs="+",
default=[0],
help="Temperature for sampling.",
)
return parser.parse_args()


def extract_accuracy_from_output(output):
# Iterate through all lines from the end to the beginning
lines = output.splitlines()[::-1]
@@ -37,9 +50,10 @@ def extract_accuracy_from_output(output):
if "acc" in data:
return data["acc"]
except json.JSONDecodeError:
continue
continue
return None


def write_logs_to_file(logs, output_file):
try:
with open(output_file, "w") as file:
@@ -48,6 +62,7 @@ def write_logs_to_file(logs, output_file):
except IOError as e:
print(f"Failed to write logs to file {output_file}: {e}")


def main():
args = parse_arguments()

@@ -60,22 +75,26 @@ def main():

script_path = "inference_and_check.py"

# Hold all logs
# Hold all logs
all_logs = ""
results = {}

# Run the Python command for each eval and collect logs
for eval_name in evals:
eval_name = eval_name.lower()
command = [
"python", script_path,
"--model", model_path,
"--dataset", eval_name,
"--split", eval_to_split[eval_name],
"--tp", str(tp),
"--temperatures"
"python",
script_path,
"--model",
model_path,
"--dataset",
eval_name,
"--tp",
str(tp),
"--temperatures",
]
command.extend(temperatures) # Add temperatures as separate arguments

if args.filter_difficulty:
assert args.source != "", "No source passed for filtering difficulty."
command.append("--filter-difficulty")
@@ -84,7 +103,9 @@ def main():
print(f"Running eval {eval_name} with command {command}")
all_logs += f"\nRunning eval: {eval_name} with command {command}\n"
try:
with subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) as proc:
with subprocess.Popen(
command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
) as proc:
output_lines = []
for line in proc.stdout:
print(line, end="") # Stream output to the console
@@ -110,5 +131,6 @@ def main():
print("Results:")
print(results)


if __name__ == "__main__":
main()
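
This diff drops the hard-coded eval_to_split mapping in favor of task configs discovered on disk via get_tasks. A rough sketch of what such a helper might do (hypothetical; the real implementation in skythought/tools/tasks/task_util.py may differ):

import os

def get_tasks(task_root_dir):
    # Assumed behavior: walk the tasks/ directory and map each task name
    # (the YAML filename without its extension) to the config's full path.
    names_to_yaml = {}
    for root, _dirs, files in os.walk(task_root_dir):
        for fname in files:
            if fname.endswith(".yaml"):
                name = os.path.splitext(fname)[0]
                names_to_yaml[name] = os.path.join(root, fname)
    return names_to_yaml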