[Evals] Refactor tasks, add linting and CI #47

Merged · 49 commits · Feb 1, 2025 · Changes from all commits

Commits
aa47191
refactoring init
SumanthRH Jan 23, 2025
ee8d0f4
add a bunch of stuff
SumanthRH Jan 24, 2025
0cf3548
add a bunch of stuff
SumanthRH Jan 24, 2025
f48b5dc
check
SumanthRH Jan 24, 2025
3e624af
check
SumanthRH Jan 24, 2025
859f023
check
SumanthRH Jan 24, 2025
d877bee
more refactoring
SumanthRH Jan 24, 2025
faaf293
x
SumanthRH Jan 24, 2025
c18ad7b
extra large commit
SumanthRH Jan 24, 2025
b7d9532
x
SumanthRH Jan 24, 2025
8b9c67e
minor linting changes
SumanthRH Jan 24, 2025
c767708
minor
SumanthRH Jan 24, 2025
995f1a5
more more more
SumanthRH Jan 24, 2025
7bafae1
mid commit
SumanthRH Jan 24, 2025
829cb4c
x
SumanthRH Jan 25, 2025
484df2d
Merge remote-tracking branch 'upstream/main' into refactor-evals
SumanthRH Jan 25, 2025
3188aeb
add answer_key
SumanthRH Jan 25, 2025
612ecaf
x
SumanthRH Jan 27, 2025
77b8ba3
x
SumanthRH Jan 27, 2025
52a4ce7
fixing pre-commit
SumanthRH Jan 27, 2025
d21d68e
x
SumanthRH Jan 27, 2025
2069df7
move some stuff; init tests; init package skyevals
SumanthRH Jan 27, 2025
05b1653
Merge branch 'refactor-evals' of github.com:sumanthrh/skythought into…
SumanthRH Jan 27, 2025
55b670c
rm llama factory change
SumanthRH Jan 27, 2025
f2d88e9
merge issues
SumanthRH Jan 27, 2025
a9a9380
more linting
SumanthRH Jan 27, 2025
d3dde5d
x
SumanthRH Jan 27, 2025
9d64fd1
more comments
SumanthRH Jan 27, 2025
d55d2a8
x
SumanthRH Jan 28, 2025
840006f
test workflows
SumanthRH Jan 28, 2025
4cdeab0
x
SumanthRH Jan 28, 2025
8d564a3
x
SumanthRH Jan 28, 2025
fc1087e
x
SumanthRH Jan 28, 2025
6e2e979
it's time to fight the CI
SumanthRH Jan 28, 2025
7449117
I might have won the fight:
SumanthRH Jan 28, 2025
04ead2a
CI please
SumanthRH Jan 28, 2025
84c9617
set up permissions
SumanthRH Jan 28, 2025
e032b47
test ci setup
SumanthRH Jan 28, 2025
7e99d60
x
SumanthRH Jan 28, 2025
3f5ff02
x
SumanthRH Jan 28, 2025
79d12a2
update to two workflows
SumanthRH Jan 28, 2025
c8a8d63
update to later vllm; needed for some tokenizer_revision fixes
SumanthRH Jan 28, 2025
aa87124
x
SumanthRH Jan 28, 2025
eab138a
x
SumanthRH Jan 28, 2025
07c21f9
small update
SumanthRH Jan 31, 2025
3d6942f
reworking args
SumanthRH Feb 1, 2025
c2944fe
x
SumanthRH Feb 1, 2025
8a39701
x
SumanthRH Feb 1, 2025
503ea62
x
SumanthRH Feb 1, 2025
52 changes: 52 additions & 0 deletions .github/workflows/cpu_ci.yml
@@ -0,0 +1,52 @@
name: Skythought evals

on: [push, pull_request_target]

permissions:
  checks: write # for status checks to appear
  contents: read

# Cancel runs for previous commits on the same branch
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  check_code_quality:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        # This is the version of the action for setting up Python, not the Python version.
        uses: actions/setup-python@v5
        with:
          # Semantic version range syntax or exact version of a Python version
          python-version: '3.10'
          cache: 'pip'
      - name: Install dependencies
        run: python -m pip install --upgrade pip setuptools wheel pre-commit
      - name: Install skythought_evals
        run: python -m pip install -e .
      - name: Run pre-commit hooks
        run: pre-commit run --all-files --config .pre-commit-config.yaml

  tests:
    needs: check_code_quality
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        # This is the version of the action for setting up Python, not the Python version.
        uses: actions/setup-python@v5
        with:
          # Semantic version range syntax or exact version of a Python version
          python-version: '3.10'
          cache: 'pip'
      - name: Install dependencies
        run: python -m pip install --upgrade pip setuptools wheel pre-commit pytest
      - name: Install skythought_evals
        run: python -m pip install -e .
      - name: Run tests
        run: python -m pytest tests/
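
For convenience, a hedged local-reproduction sketch (not part of this PR) that mirrors the two jobs above from a repo-root checkout; like the `tests` job's `needs:` clause, it stops as soon as the lint step fails:

```python
# Hypothetical helper mirroring the CI steps above; assumes it runs from the repo root.
import subprocess

STEPS = [
    # shared setup from both jobs
    ["python", "-m", "pip", "install", "--upgrade", "pip", "setuptools", "wheel", "pre-commit", "pytest"],
    ["python", "-m", "pip", "install", "-e", "."],
    # check_code_quality job
    ["pre-commit", "run", "--all-files", "--config", ".pre-commit-config.yaml"],
    # tests job (gated on the lint job via `needs: check_code_quality`)
    ["python", "-m", "pytest", "tests/"],
]

for cmd in STEPS:
    subprocess.run(cmd, check=True)  # check=True aborts on the first failing step
```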
1 change: 0 additions & 1 deletion .gitignore
@@ -167,6 +167,5 @@ cython_debug/

.json
token_usage/
test_*

run_all.sh
16 changes: 16 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,16 @@
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.9.3
    hooks:
      - id: ruff
        args: [ --fix, --exit-non-zero-on-fix ]
        # NOTE (sumanthrh): Many of the files excluded here are used for validating code generation, and linters do not recognize some of the logic in these files. skythought/train is excluded for now because it's a fork of Llamafactory
        exclude: (^skythought/train|skythought_evals/tasks/taco/pyext2\.py|skythought_evals/tasks/taco/taco_util\.py|skythought_evals/tasks/apps/apps_util\.py|skythought_evals/util/prompts\.py|skythought_evals/util/model_utils\.py)$


  # Black needs to be run after ruff with --fix
  - repo: https://github.com/psf/black
    rev: 24.10.0
    hooks:
      - id: black
        exclude: (^skythought/train/.*|skythought_evals/tasks/taco/pyext2\.py)$
11 changes: 11 additions & 0 deletions format.sh
@@ -0,0 +1,11 @@

set -e

if command -v uv >/dev/null 2>&1; then
    uv pip install -q pre-commit
else
    pip install -q pre-commit
fi

# pre-commit run --all-files always runs from the root directory. we run this only on tools/ for now.
pre-commit run --all-files --config .pre-commit-config.yaml
6 changes: 6 additions & 0 deletions pyproject.toml
@@ -0,0 +1,6 @@
[tool.ruff]
line-length = 160

[tool.ruff.lint]
extend-select = ["E", "F", "I", "ASYNC", "B"]
ignore = ["F811", "B006"]
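
For context, a small illustrative Python file (not part of this PR) showing the kinds of patterns the selected rule families target: `F` (Pyflakes), `E` (pycodestyle), `I` (import sorting), `ASYNC` (blocking calls in async code), and `B` (flake8-bugbear). The annotations are approximate; only F811 and B006 are named in the config itself:

```python
import json  # unused import: flagged by the F family (F401)
import sys
import os  # unsorted imports: reordered by the I (isort) family


async def load(path):
    # Blocking file I/O inside an async function: flagged by the ASYNC family.
    return open(path).read()


def accumulate(x, items=[]):  # mutable default argument: bugbear B006, ignored by this config
    items.append(x)
    return items


def accumulate(x, items=None):  # redefinition of `accumulate`: F811, also ignored here
    return [x] if items is None else items + [x]
```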
23 changes: 23 additions & 0 deletions setup.py
@@ -0,0 +1,23 @@
# setup module skyevals in tools directory
from pathlib import Path

import setuptools


def get_requirements():
    req_path = Path("skythought/skythought_evals/requirements.txt")
    with open(req_path, "r") as f:
        return f.read().splitlines()


setuptools.setup(
    name="skythought_evals",
    version="0.0.1",
    package_dir={"": "skythought"},
    packages=setuptools.find_packages(
        where="skythought",
        include=["skythought_evals*"],  # Only pick up skythought_evals, skip 'train'
    ),
    install_requires=get_requirements(),
    python_requires=">=3.9,<3.12",  # pyext doesn't work with python 3.12
)
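
As a quick sanity check (not from the PR), the layout above means an editable install exposes `skythought_evals` as the importable top-level package; the import path below matches the ones introduced later in this diff:

```python
# Assumes `python -m pip install -e .` has been run from the repo root.
# package_dir={"": "skythought"} makes skythought/skythought_evals importable as
# `skythought_evals`, while the `train` directory is deliberately skipped.
from skythought_evals.util.prompts import system_prompt

print(type(system_prompt))  # simple smoke test that the package resolves
```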
Empty file added skythought/__init__.py
Empty file.
@@ -27,17 +27,17 @@ The expected output is labeled_source_0_-1.json. We also provide instructions to
Run inference with QwQ on several datasets. In the preview version, we use data from the following datasets.

```shell
python inference_and_check.py --dataset APPS --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split test --source all --result-dir $SKYT_HOME/data --inference
python inference_and_check.py --task apps --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split test --difficulty all --result-dir $SKYT_HOME/data --inference

python inference_and_check.py --dataset TACO --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split train --source MEDIUM --filter-difficulty --result-dir $SKYT_HOME/data --inference
python inference_and_check.py --task taco --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split train --difficulty MEDIUM --result-dir $SKYT_HOME/data --inference

python inference_and_check.py --dataset TACO --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split test --source all --result-dir $SKYT_HOME/data --inference
python inference_and_check.py --task taco --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split test --difficulty all --result-dir $SKYT_HOME/data --inference

python inference_and_check.py --dataset NUMINA --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split train --source math --filter-difficulty --result-dir $SKYT_HOME/data --inference
python inference_and_check.py --task numina --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split train --source math --filter-difficulty --result-dir $SKYT_HOME/data --inference

python inference_and_check.py --dataset NUMINA --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split train --source amc_aime --filter-difficulty --result-dir $SKYT_HOME/data --inference
python inference_and_check.py --task numina --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split train --source amc_aime --filter-difficulty --result-dir $SKYT_HOME/data --inference

python inference_and_check.py --dataset NUMINA --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split train --source olympiads --end 20000 --filter-difficulty --result-dir $SKYT_HOME/data --inference
python inference_and_check.py --task numina --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split train --source olympiads --end 20000 --filter-difficulty --result-dir $SKYT_HOME/data --inference
```

### Step 2: Format the response
@@ -48,7 +48,7 @@ python convert_format.py --input_dir $SKYT_HOME/data --keys keys.txt

### Step 3: Reject Sampling on the formatted data (Example Usage with previous script)
```shell
python inference_and_check.py --dataset APPS --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split test --source all --result-dir $SKYT_HOME/data --check
python inference_and_check.py --task apps --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split test --subset all --result-dir $SKYT_HOME/data --check
```
Similar for other datasets.

@@ -67,24 +67,23 @@ Currently we support distill and reject sampling from various self-hosted models
#### Example Usage

```shell
python inference_and_check.py --dataset APPS --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split test --source all --result-dir $SKYT_HOME/data
python inference_and_check.py --task apps --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split test --difficulty all --result-dir $SKYT_HOME/data

python inference_and_check.py --dataset TACO --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split train --source MEDIUM --filter-difficulty --result-dir $SKYT_HOME/data
python inference_and_check.py --task taco --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split train --difficulty MEDIUM --result-dir $SKYT_HOME/data

python inference_and_check.py --dataset TACO --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split test --source all --result-dir $SKYT_HOME/data
python inference_and_check.py --task taco --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split test --difficulty all --result-dir $SKYT_HOME/data

python inference_and_check.py --dataset NUMINA --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split train --source math --filter-difficulty --result-dir $SKYT_HOME/data --math_difficulty_lower_bound 4 --math_difficulty_upper_bound 9
python inference_and_check.py --task numina --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split train --source math --filter-difficulty --result-dir $SKYT_HOME/data --math-difficulty-lower-bound 4 --math-difficulty-upper-bound 9

python inference_and_check.py --dataset NUMINA --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split train --source amc_aime --filter-difficulty --result-dir $SKYT_HOME/data --math_difficulty_lower_bound 1 --math_difficulty_upper_bound 9
python inference_and_check.py --task numina --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split train --source amc_aime --filter-difficulty --result-dir $SKYT_HOME/data --math-difficulty-lower-bound 1 --math-difficulty-upper-bound 9

python inference_and_check.py --dataset NUMINA --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split train --source olympiads --end 20000 --filter-difficulty --result-dir $SKYT_HOME/data --math_difficulty_lower_bound 9 --math_difficulty_upper_bound 9
python inference_and_check.py --task numina --model Qwen/QwQ-32B-Preview --tp 8 --max_tokens 16384 --split train --end 20000 --source olympiads --filter-difficulty --result-dir $SKYT_HOME/data --math-difficulty-lower-bound 9 --math-difficulty-upper-bound 9
```


#### Best-of-N Inference and Check
```bash
python inference_and_check.py --dataset MATH500 --model Qwen/Qwen2-7B-Instruct --tp 4 --max_tokens 4096 --split test --result-dir ./ --inference --temperatures 0.7 --n 64
python inference_and_check.py --dataset MATH500 --model Qwen/Qwen2-7B-Instruct --tp 4 --max_tokens 4096 --split test --result-dir ./ --check --temperatures 0.7 --n 8
python inference_and_check.py --task math500 --model Qwen/Qwen2-7B-Instruct --tp 4 --max_tokens 4096 --split test --result-dir ./ --inference --temperatures 0.7 --n 64
python inference_and_check.py --task math500 --model Qwen/Qwen2-7B-Instruct --tp 4 --max_tokens 4096 --split test --result-dir ./ --check --temperatures 0.7 --n 8
```

### Benchmark Evaluations
@@ -95,12 +94,12 @@ We provide a wrapper script `eval.py` to conveniently run reasoning benchmarks.
**NOTE**: For reproducing `Sky-T1-32B-Preview` results on the `AIME` and `GPQADiamond` datasets, pass in temperatures as `0.7`.

```shell
python eval.py --model NovaSky-AI/Sky-T1-32B-Preview --evals=AIME,GPQADiamond --tp=8 --output_file=results.txt --temperatures 0.7
python eval.py --model NovaSky-AI/Sky-T1-32B-Preview --evals=aime,gpqa_diamond --tp=8 --output_file=results.txt --temperatures 0.7
```

#### Example Usage
```shell
python eval.py --model Qwen/QwQ-32B-Preview --evals=AIME,MATH500,GPQADiamond --tp=8 --output_file=results.txt
python eval.py --model Qwen/QwQ-32B-Preview --evals=aime,math500,gpqa_diamond --tp=8 --output_file=results.txt
```

Example result: `{"AIME": <aime_accuracy>, "MATH500": <math500_accuracy>, "GPQADiamond": <gpqa_diamond_accuracy>}`
@@ -111,14 +110,14 @@ The file `response_rewrite.py` provides a pipeline for filtering and rewriting responses
To use our preference optimization pipeline, first generate and score multiple responses using `inference_and_check.py`. For example:

```shell
python inference_and_check.py --inference --dataset MATH500 --model Qwen/Qwen2-7B-Instruct --tp 4 --max_tokens 4096 --split test --result-dir ./ --temperatures 0.7 --n 8
python inference_and_check.py --check --dataset MATH500 --model Qwen/Qwen2-7B-Instruct --tp 4 --max_tokens 4096 --split test --result-dir ./ --temperatures 0.7 --n 8
python inference_and_check.py --inference --task math500 --model Qwen/Qwen2-7B-Instruct --tp 4 --max_tokens 4096 --result-dir ./ --temperatures 0.7 --n 8
python inference_and_check.py --check --task math500 --model Qwen/Qwen2-7B-Instruct --tp 4 --max_tokens 4096 --result-dir ./ --temperatures 0.7 --n 8
```

Then, use `response_rewrite.py` to process the responses into preference pairs. By default, the shortest correct responses will be used as positive examples and the longest correct responses will be used as negative samples. The argument `--SILC` can be used to also include short incorrect responses as negative examples and long correct responses as positive samples.

```shell
python response_rewrite.py --SILC --rewrite-model meta-llama/Meta-Llama-3-8B-Instruct --target-model NovaSky-AI/Sky-T1-32B-Preview --dataset [PATH_TO_GENERATED_RESPONSES] --result-dir ./ --checkpoint --tp 8
python response_rewrite.py --SILC --rewrite-model meta-llama/Meta-Llama-3-8B-Instruct --target-model NovaSky-AI/Sky-T1-32B-Preview --task [PATH_TO_GENERATED_RESPONSES] --result-dir ./ --checkpoint --tp 8
```
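
For illustration only, a minimal Python sketch of the default pairing rule described above; this is not the actual `response_rewrite.py` implementation, and the response fields (`text`, `correct`) are assumed for the example:

```python
def build_preference_pairs(responses, silc=False):
    """Sketch of the default rule: shortest correct response as the positive
    (chosen) example, longest correct response as the negative (rejected)
    example. With silc=True, also pair long correct responses (positive)
    against short incorrect responses (negative).

    `responses` is assumed to be a list of dicts like
    {"text": str, "correct": bool}; field names are hypothetical.
    """
    correct = sorted((r for r in responses if r["correct"]), key=lambda r: len(r["text"]))
    incorrect = sorted((r for r in responses if not r["correct"]), key=lambda r: len(r["text"]))

    pairs = []
    if len(correct) >= 2:
        pairs.append({"chosen": correct[0]["text"], "rejected": correct[-1]["text"]})
    if silc and correct and incorrect:
        pairs.append({"chosen": correct[-1]["text"], "rejected": incorrect[0]["text"]})
    return pairs
```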

The `--checkpoint` argument can optionally be used to save intermediate files of the processed data between steps, in case of failure.
Empty file.
@@ -1,6 +1,7 @@
import json
import random
from util.prompts import system_prompt

from skythought_evals.util.prompts import system_prompt

still2_jsonl_file = "../../data/public_long_form_thought_data_5k.jsonl"
code_json_file = "../../data/converted_apps_long_form_thought_data_5k.json"
@@ -25,14 +26,11 @@
# Create the conversation format
conversations = [
{"from": "user", "value": question},
{"from": "assistant", "value": combined_text}
{"from": "assistant", "value": combined_text},
]

# Prepare the final structure
cur_data = {
"system": system_prompt,
"conversations": conversations
}
cur_data = {"system": system_prompt, "conversations": conversations}
all_data.append(cur_data)
else:
code_num += 1
@@ -43,14 +41,19 @@
# print(code_data[0])

all_data.extend(code_data)
print(f"First item slice before shuffle: {all_data[0]['conversations'][-1]['value'][-50:-1]}")
print(
f"First item slice before shuffle: {all_data[0]['conversations'][-1]['value'][-50:-1]}"
)
random.shuffle(all_data)
print(f"First item slice after shuffle: {all_data[0]['conversations'][-1]['value'][-50:-1]}")
print(
f"First item slice after shuffle: {all_data[0]['conversations'][-1]['value'][-50:-1]}"
)
print(len(all_data))

# Save the converted data to the output file
with open(output_file, "w") as f:
json.dump(all_data, f, indent=4)

print(f"Conversion completed. The data has been saved to {output_file} with {len(all_data)} data.")

print(
f"Conversion completed. The data has been saved to {output_file} with {len(all_data)} data."
)
@@ -1,23 +1,27 @@
import json
import argparse
from tqdm import tqdm
import json
import multiprocessing as mp
import openai
from itertools import cycle
import time
import os
from util.prompts import convert_prompt, convert_prompt_example
import time
from itertools import cycle

import openai
from skythought_evals.util.prompts import convert_prompt, convert_prompt_example
from tqdm import tqdm

global args


# Function to set the OpenAI API key
def set_openai_key(api_key):
openai.api_key = api_key


# GPT API processing function with retry logic
def process_content(content, api_key):
# Set the OpenAI key for this request
set_openai_key(api_key)

# GPT prompt
prompt = convert_prompt.format(example=convert_prompt_example, content=content)

@@ -28,44 +32,54 @@ def process_content(content, api_key):
response = openai.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": "You are a solution format convertor."},
{"role": "user", "content": prompt}
{
"role": "system",
"content": "You are a solution format convertor.",
},
{"role": "user", "content": prompt},
],
max_tokens=16384,
temperature=0.7
temperature=0.7,
)
return response.choices[0].message.content
except openai.RateLimitError:
retries -= 1
if retries == 0:
return "Error: Rate limit reached and retries exhausted."
print(f"Sleep for 5 seconds for API limit.")
print("Sleep for 5 seconds for API limit.")
time.sleep(5)
except Exception as e:
return f"Error processing content: {e}"


# Function for multiprocessing
def process_entry(entry, api_key_cycle):
key, values = entry
content = values["responses"]["0.7"]["content"]

# Get the next API key from the cycle
api_key = next(api_key_cycle)

processed = process_content(content, api_key)
values["responses"]["0.7"]["processed_content"] = processed

return key, values


# Wrapper function for multiprocessing
def process_entry_wrapper(args):
return process_entry(*args)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Process content and save results.")
parser.add_argument("--input_dir", type=str, help="Input directory containing JSON files.")
parser.add_argument("--keys", type=str, help="File containing OpenAI API keys (one per line).")

parser.add_argument(
"--input_dir", type=str, help="Input directory containing JSON files."
)
parser.add_argument(
"--keys", type=str, help="File containing OpenAI API keys (one per line)."
)

global args
args = parser.parse_args()

@@ -90,7 +104,9 @@ def process_entry_wrapper(args):
results = []
with mp.Pool(os.cpu_count()) as pool:
tasks = [(entry, api_key_cycle) for entry in data.items()]
for result in tqdm(pool.imap(process_entry_wrapper, tasks), total=len(data)):
for result in tqdm(
pool.imap(process_entry_wrapper, tasks), total=len(data)
):
results.append(result)

# Aggregate and write results in the main process