Formatting and Fix Mamba Config #1270

Merged 1 commit on Sep 8, 2024
1 change: 1 addition & 0 deletions configs/mamba/mamba-130M.yml
@@ -86,3 +86,4 @@
"steps_per_print": 10,
"keep_last_n_checkpoints": 4,
"wall_clock_breakdown": true,
}
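This one-line addition closes the top-level mapping: the 130M Mamba config previously ended after "wall_clock_breakdown" without its closing brace, so the file could not parse. A minimal sanity check, as a sketch (the check itself is not part of the repository):

from pathlib import Path

# Hypothetical post-fix check: the config's braces should now balance.
text = Path("configs/mamba/mamba-130M.yml").read_text()
assert text.count("{") == text.count("}"), "unbalanced braces in config"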
12 changes: 6 additions & 6 deletions megatron/data/helpers.cpp
(Both hunks in this file are whitespace-only realignments of the trailing closing-brace comments; the indentation is stripped in this view, so the removed and added lines look identical.)
@@ -428,9 +428,9 @@ py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
 }
 
 } // for (auto sent_index=sent_index_first; ...
-} // if (num_remain_sent > 1) {
-} // for (int doc=0; doc < num_docs; ++doc) {
-} // for (int epoch=0; epoch < num_epochs; ++epoch) {
+} // if (num_remain_sent > 1) {
+} // for (int doc=0; doc < num_docs; ++doc) {
+} // for (int epoch=0; epoch < num_epochs; ++epoch) {
 
 if (!second) {
   if (verbose) {
@@ -660,9 +660,9 @@ py::array build_blocks_mapping_impl(const py::array_t<int64_t>& docs_,
 num_sent = 0;
 }
 } // for (auto sent_index=sent_index_first; ...
-} // if (num_remain_sent > 1) {
-} // for (int doc=0; doc < num_docs; ++doc) {
-} // for (int epoch=0; epoch < num_epochs; ++epoch) {
+} // if (num_remain_sent > 1) {
+} // for (int doc=0; doc < num_docs; ++doc) {
+} // for (int epoch=0; epoch < num_epochs; ++epoch) {
 
 if (!second) {
   if (verbose) {
14 changes: 10 additions & 4 deletions megatron/model/mamba/mamba.py
@@ -13,8 +13,10 @@
     from causal_conv1d import causal_conv1d_fn
     import einops
 except ModuleNotFoundError:
-    print( "Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, \
-        or directly from https://github.com/state-spaces/mamba")
+    print(
+        "Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, \
+        or directly from https://github.com/state-spaces/mamba"
+    )
     pass
 
 from megatron.model.norms import get_norm
@@ -44,7 +46,9 @@ def __init__(
             neox_args.mamba_use_bias_in_linears and neox_args.mamba_inner_func_fusion
         ), "Mamba fused inner fn and bias in x_proj not compatible!"
 
-        assert neox_args.intermediate_size == None or neox_args.expansion_factor == None, "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections"
+        assert (
+            neox_args.intermediate_size == None or neox_args.expansion_factor == None
+        ), "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections"
 
         # set variables, mostly following mamba defaults
         self.d_model = neox_args.hidden_size
@@ -53,7 +57,9 @@ def __init__(
         if neox_args.intermediate_size:
             self.d_inner = neox_args.intermediate_size
         else:
-            self.expand = neox_args.expansion_factor if neox_args.expansion_factor else 2
+            self.expand = (
+                neox_args.expansion_factor if neox_args.expansion_factor else 2
+            )
             self.d_inner = int(self.expand * self.d_model)
         self.dt_rank = math.ceil(self.d_model / 16)  # rank of dt / Delta parameter
         self.dt_scale = 1.0
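The assert and the reflowed conditional above encode a single sizing rule for the Mamba projections: pass either an absolute intermediate_size or a relative expansion_factor, never both. A standalone sketch of that rule (the function name and example values are illustrative only):

def mamba_d_inner(hidden_size, intermediate_size=None, expansion_factor=None):
    # Exactly one knob may be set; both None falls back to the Mamba default.
    assert intermediate_size is None or expansion_factor is None
    if intermediate_size:
        return intermediate_size
    expand = expansion_factor if expansion_factor else 2  # default 2x expansion
    return int(expand * hidden_size)

print(mamba_d_inner(768))        # 1536: default 2x expansion
print(mamba_d_inner(768, 2048))  # 2048: the absolute size wins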
8 changes: 6 additions & 2 deletions megatron/model/rwkv/v6/rwkv.py
@@ -275,13 +275,17 @@ def __init__(self, neox_args, layer_number):
         self.layer_number = layer_number
         self.fp16 = neox_args.precision == "fp16"
         self.bf16 = neox_args.precision == "bfloat16"
-        assert neox_args.intermediate_size == None or neox_args.expansion_factor == None, "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections"
+        assert (
+            neox_args.intermediate_size == None or neox_args.expansion_factor == None
+        ), "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections"
         if not hasattr(neox_args, "dim_att"):
             neox_args.dim_att = neox_args.hidden_size
         if neox_args.intermediate_size:
             neox_args.ffn_dim = neox_args.intermediate_size
         else:
-            self.expand = neox_args.expansion_factor if neox_args.expansion_factor else 3.5
+            self.expand = (
+                neox_args.expansion_factor if neox_args.expansion_factor else 3.5
+            )
             neox_args.ffn_dim = int(self.expand * neox_args.hidden_size)
         # Make hidden size 3.5x by default. Round to nearest multiple of 32 until we add hdim rounding logic
         neox_args.ffn_dim = int(neox_args.ffn_dim // 32 * 32)
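The RWKV branch applies the same either/or rule, with a 3.5x default expansion, then rounds the resulting width down to a multiple of 32. Worked through with an assumed hidden size:

hidden_size = 600                    # illustrative value, not from the diff
expand = 3.5                         # default when no expansion_factor is given
ffn_dim = int(expand * hidden_size)  # 2100
ffn_dim = ffn_dim // 32 * 32         # round down to the nearest multiple of 32
print(ffn_dim)                       # 2080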
10 changes: 8 additions & 2 deletions megatron/model/transformer.py
@@ -98,7 +98,9 @@ def __init__(
         MoE_mp_size=1,
     ):
         super().__init__()
-        assert neox_args.intermediate_size == None or neox_args.expansion_factor == None, "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections"
+        assert (
+            neox_args.intermediate_size == None or neox_args.expansion_factor == None
+        ), "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections"
 
         self.activation_func, self.is_gated = get_activation(neox_args)
         self.activation_type = neox_args.activation
@@ -1230,7 +1232,11 @@ def forward(self, x, attention_mask, layer_past=None):
                 raise KeyError(self.moe_type)
 
         with torch.enable_grad():
-            if self.activation == "swiglu" or self.num_experts > 1 and self.moe_type == "deepspeed":
+            if (
+                self.activation == "swiglu"
+                or self.num_experts > 1
+                and self.moe_type == "deepspeed"
+            ):
                 # No dropout either
                 assert mlp_bias is None
                 output = mlp_output + attention_output
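The reflow leaves Python's operator precedence unchanged: and binds tighter than or, so the condition means self.activation == "swiglu" or (self.num_experts > 1 and self.moe_type == "deepspeed"). A quick check with assumed values:

activation, num_experts, moe_type = "swiglu", 1, "megablocks"
cond = activation == "swiglu" or num_experts > 1 and moe_type == "deepspeed"
# Identical to the explicitly parenthesized reading:
assert cond == (activation == "swiglu" or (num_experts > 1 and moe_type == "deepspeed"))
assert cond  # True: the swiglu test short-circuits the rest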
4 changes: 3 additions & 1 deletion megatron/neox_arguments/arguments.py
@@ -1187,7 +1187,9 @@ def validate_values(self):
             return False
 
         # Checks.
-        if self.hidden_size % self.num_attention_heads != 0 and not ("mamba" in self.attention_config):
+        if self.hidden_size % self.num_attention_heads != 0 and not (
+            "mamba" in self.attention_config
+        ):
             error_message = (
                 self.__class__.__name__
                 + ".validate_values() hidden_size must be divisible by num_attention_heads"
6 changes: 4 additions & 2 deletions megatron/tokenizer/tokenizer.py
@@ -31,8 +31,10 @@ def build_tokenizer(args):
"""Initialize tokenizer."""
if args.rank == 0:
print("> building {} tokenizer ...".format(args.tokenizer_type), flush=True)

assert args.tokenizer_type is not None, "tokenizer_type must be specified in the .yml config"

assert (
args.tokenizer_type is not None
), "tokenizer_type must be specified in the .yml config"

# Select and instantiate the tokenizer.
if args.tokenizer_type.lower() == "GPT2BPETokenizer".lower():