
Commit 57c7aec
Merge branch 'main' into pixtral_integration2
bursteratom authored Jan 27, 2025
2 parents a50572e + 8875132 commit 57c7aec
Showing 87 changed files with 1,813 additions and 1,835 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -519,8 +519,8 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
 train_on_split: validation
 # loading from s3 or gcs
-# s3 creds will be loaded from the system default and gcs only supports public access
-- path: s3://path_to_ds # Accepts folder with arrow/parquet or file path like above. Supports s3, gcs.
+# s3 creds will be loaded from the system default / gcs will attempt to load from gcloud creds, google metadata service, or anon
+- path: s3://path_to_ds # Accepts folder with arrow/parquet or file path like above
 ...
 # Loading Data From a Public URL
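For context, here is a minimal sketch (bucket and path hypothetical) of how such an s3 parquet folder can be read with the Hugging Face `datasets` library, which resolves s3 access through fsspec/s3fs with credentials taken from the environment:

```python
# Hypothetical bucket/path; requires `s3fs` installed alongside `datasets`.
from datasets import load_dataset

ds = load_dataset(
    "parquet",
    data_files="s3://my-bucket/my-dataset/*.parquet",  # folder of parquet shards
    split="train",
)
```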
3 changes: 2 additions & 1 deletion cicd/cicd.sh
@@ -6,5 +6,6 @@ python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"
 pytest -v --durations=10 -n8 --ignore=tests/e2e/ --ignore=tests/patched/ /workspace/axolotl/tests/
 # pytest -v --durations=10 -n8 --dist loadfile /workspace/axolotl/tests/patched/
 pytest -v --durations=10 /workspace/axolotl/tests/e2e/patched/
+pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/solo/
 pytest -v --durations=10 /workspace/axolotl/tests/e2e/integrations/
-pytest -v --durations=10 --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ /workspace/axolotl/tests/e2e/
+pytest -v --durations=10 --ignore=tests/e2e/solo/ --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ /workspace/axolotl/tests/e2e/
3 changes: 2 additions & 1 deletion docker/Dockerfile-cloud
@@ -20,7 +20,8 @@ RUN apt install --yes --no-install-recommends openssh-server tmux && \
     printf "\n[[ -z \"\$TMUX\" ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
     printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \
     chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \
-    chmod +x /root/cloud-entrypoint.sh
+    chmod +x /root/cloud-entrypoint.sh && \
+    echo 'set-option -g history-limit 5000' >> ~/.tmux.conf

 ENTRYPOINT ["/root/cloud-entrypoint.sh"]
 CMD ["sleep", "infinity"]
9 changes: 6 additions & 3 deletions docs/config.qmd
@@ -244,6 +244,8 @@ total_num_tokens:
 sample_packing_group_size: 100000
 # The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples.
 sample_packing_bin_size: 200
+# whether to concatenate samples during pretraining
+pretraining_sample_concatenation:

 # Use batch flattening for speedups when not using sample_packing
 batch_flattening:
@@ -358,10 +360,11 @@ warmup_ratio: 0.05 # cannot use with warmup_steps
 learning_rate: 0.00003
 lr_quadratic_warmup:
 logging_steps:
-eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
+eval_steps: # Leave empty to eval at each epoch, integer for every N steps. float for fraction of total steps
 evals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps
-save_strategy: # Set to `"no"` to skip checkpoint saves
-save_steps: # Leave empty to save at each epoch
+eval_strategy: # Set to `"no"` to skip evaluation, `"epoch"` at end of each epoch, leave empty to infer from `eval_steps`.
+save_strategy: # Set to `"no"` to skip checkpoint saves, `"epoch"` at end of each epoch, `"best"` when better result is achieved, leave empty to infer from `save_steps`.
+save_steps: # Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps
 saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
 save_total_limit: # Checkpoints saved at a time
 # Maximum number of iterations to train for. It precedes num_epochs which means that
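To make the fractional semantics concrete, here is a small sketch (values hypothetical) of how a float `save_steps` is typically resolved against the total step count, assuming Hugging Face `Trainer`-style behavior where a float in (0, 1) means a ratio of total training steps:

```python
# Hypothetical values; assumes a float in (0, 1) is a ratio of total steps.
total_steps = 1000
save_steps = 0.25  # fraction -> checkpoint every 250 steps

interval = int(total_steps * save_steps)
checkpoint_steps = list(range(interval, total_steps + 1, interval))
print(checkpoint_steps)  # [250, 500, 750, 1000]
```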
9 changes: 8 additions & 1 deletion docs/dataset-formats/pretraining.qmd
@@ -19,7 +19,14 @@ For pretraining, there is no prompt template or roles. The only required field
 Axolotl usually loads the entire dataset into memory. This will be challenging for large datasets. Use the following config to enable streaming:

 ```{.yaml filename="config.yaml"}
-pretraining_dataset: # hf path only
+pretraining_dataset:
+  - name:
+    path:
+    split:
+    text_column: # column in dataset with the data, usually `text`
+    type: pretrain
+    trust_remote_code:
+    skip: # number of rows of data to skip over from the beginning
 ...
 ```

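As a point of reference, a minimal sketch (dataset path illustrative) of what streaming means with the Hugging Face `datasets` library: rows are yielded lazily instead of being materialized in memory, and the `skip` option maps naturally onto the iterable API:

```python
from datasets import load_dataset

# Illustrative dataset; streaming=True returns an IterableDataset that is
# read lazily rather than downloaded and loaded into memory up front.
ds = load_dataset("allenai/c4", "en", split="train", streaming=True)
ds = ds.skip(100)        # analogous to the `skip` option above
for row in ds.take(2):   # peek at a couple of rows
    print(row["text"][:80])
```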
29 changes: 29 additions & 0 deletions docs/lr_groups.qmd
@@ -0,0 +1,29 @@
---
title: Learning Rate Groups
description: "Setting different learning rates by module name"
---

## Background

Inspired by LoRA+, Axolotl allows practitioners to specify separate learning rates for individual modules or groups of modules in a model.

## Example

```yaml
lr_groups:
  - name: o_proj
    modules:
      - self_attn.o_proj.weight
    lr: 1e-6
  - name: q_proj
    modules:
      - model.layers.2.self_attn.q_proj.weight
    lr: 1e-5

learning_rate: 2e-5
```
In this example, we have a default learning rate of 2e-5 across the entire model, but a separate learning rate
of 1e-6 for the self-attention `o_proj` modules across all layers, and a learning rate of 1e-5 for the 3rd layer's
self-attention `q_proj` module.
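For intuition, here is a minimal sketch (not Axolotl's actual implementation; matching is done illustratively by substring) of how such groups can be translated into PyTorch optimizer parameter groups:

```python
import torch

def build_param_groups(model, lr_groups, default_lr):
    """Translate config-style lr_groups into torch.optim parameter groups."""
    groups, matched = [], set()
    for group in lr_groups:
        params = []
        for name, param in model.named_parameters():
            if name not in matched and any(m in name for m in group["modules"]):
                params.append(param)
                matched.add(name)
        groups.append({"params": params, "lr": group["lr"]})
    # All remaining parameters fall back to the default learning rate.
    rest = [p for n, p in model.named_parameters() if n not in matched]
    groups.append({"params": rest, "lr": default_lr})
    return groups

lr_groups = [
    {"name": "o_proj", "modules": ["self_attn.o_proj.weight"], "lr": 1e-6},
    {"name": "q_proj", "modules": ["model.layers.2.self_attn.q_proj.weight"], "lr": 1e-5},
]
# optimizer = torch.optim.AdamW(build_param_groups(model, lr_groups, default_lr=2e-5))
```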
4 changes: 2 additions & 2 deletions requirements.txt
@@ -13,9 +13,9 @@ liger-kernel==0.5.2
 packaging==23.2

 peft==0.14.0
-transformers==4.47.1
+transformers==4.48.1
 tokenizers>=0.21.0
-accelerate==1.2.1
+accelerate==1.3.0
 datasets==3.2.0
 deepspeed==0.16.1
 trl==0.13.0
2 changes: 1 addition & 1 deletion scripts/chat_datasets.py
@@ -30,7 +30,7 @@ def parse_dataset(dataset=None, split="train"):
     )
     ds_cfg["field_messages"] = field_messages

-    message_fields = features["conversations"][0].keys()
+    message_fields = features[field_messages][0].keys()
     message_field_role = None
     for key in ["from", "role"]:
         if key in message_fields:
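The fix replaces a hardcoded `"conversations"` key with the user-supplied `field_messages`. A small illustration (toy data, not the script itself) of the feature lookup involved:

```python
from datasets import Dataset

# Toy chat dataset whose message list lives under "messages" rather than
# the "conversations" key the old code hardcoded.
ds = Dataset.from_dict(
    {"messages": [[{"role": "user", "content": "hi"},
                   {"role": "assistant", "content": "hello"}]]}
)

field_messages = "messages"
message_fields = ds.features[field_messages][0].keys()
print(list(message_fields))  # e.g. ['content', 'role'] (order may vary)
```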
52 changes: 0 additions & 52 deletions scripts/finetune.py

This file was deleted.
