Update the training notebook with the latest training updates
jshuadvd committed Jul 15, 2024
1 parent 324b811 commit 5189c33
Showing 2 changed files with 16 additions and 16 deletions.
29 changes: 16 additions & 13 deletions notebooks/01_LongRoPE_training.ipynb
@@ -168,20 +168,23 @@
" \"\"\"\n",
" sequences = []\n",
" start = 0\n",
" while start < len(data):\n",
" end = start + max_length\n",
" chunk = data[start:end]\n",
" # tokenized_chunk = tokenizer.encode(chunk)\n",
" # Cache the tokenized chunk\n",
" tokenized_chunk = cached_tokenize(chunk, tokenizer)\n",
"\n",
" # Create sliding window sequences from the tokenized chunk\n",
" chunk_sequences = create_sliding_window_chunks(\n",
" tokenized_chunk, max_length=max_length, overlap=overlap\n",
" )\n",
" sequences.extend(chunk_sequences)\n",
" total_chunks = (len(data) - overlap) // (max_length - overlap)\n",
"\n",
" with tqdm(total=total_chunks, desc=\"Preprocessing\") as pbar:\n",
" while start < len(data):\n",
" end = start + max_length\n",
" chunk = data[start:end]\n",
" # Cache the tokenized chunk\n",
" tokenized_chunk = cached_tokenize(chunk, tokenizer)\n",
"\n",
" # Create sliding window sequences from the tokenized chunk\n",
" chunk_sequences = create_sliding_window_chunks(\n",
" tokenized_chunk, max_length=max_length, overlap=overlap\n",
" )\n",
" sequences.extend(chunk_sequences)\n",
"\n",
" start = end - overlap\n",
" start = end - overlap\n",
" pbar.update(1)\n",
"\n",
" return sequences\n",
"\n",
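Pieced together from the hunk above, the updated notebook cell reads as follows. This is a sketch assembled from the diff, not a verbatim copy of the notebook: cached_tokenize and create_sliding_window_chunks are defined elsewhere in the repository, and the tqdm import is assumed to live earlier in the notebook (this commit removes the duplicate import from train.py).

from tqdm import tqdm

def preprocess_data(data, tokenizer, max_length, overlap):
    """
    Preprocess the input data by tokenizing it in chunks and creating
    sliding window sequences.
    """
    sequences = []
    start = 0

    # Rough chunk count, used only to size the progress bar; each loop
    # iteration advances the window by (max_length - overlap) characters.
    total_chunks = (len(data) - overlap) // (max_length - overlap)

    with tqdm(total=total_chunks, desc="Preprocessing") as pbar:
        while start < len(data):
            end = start + max_length
            chunk = data[start:end]
            # Cache the tokenized chunk
            tokenized_chunk = cached_tokenize(chunk, tokenizer)

            # Create sliding window sequences from the tokenized chunk
            chunk_sequences = create_sliding_window_chunks(
                tokenized_chunk, max_length=max_length, overlap=overlap
            )
            sequences.extend(chunk_sequences)

            start = end - overlap
            pbar.update(1)

    return sequences

Note that total_chunks is only an estimate for the progress bar and can undercount by one; the loop itself still terminates on start < len(data), so no data is skipped.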
3 changes: 0 additions & 3 deletions train.py
@@ -134,9 +134,6 @@ def cached_tokenize(text, tokenizer, cache_dir="tokenizer_cache"):
     return tokenized
 
 
-from tqdm import tqdm
-
-
 def preprocess_data(data, tokenizer, max_length, overlap):
     """
     Preprocess the input data by tokenizing it in chunks and creating sliding window sequences.
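For context, a minimal usage sketch of the updated function. The tokenizer, file path, and the max_length/overlap values below are illustrative assumptions, not taken from this commit:

from transformers import AutoTokenizer

# Hypothetical setup: any character-indexable text corpus and any
# tokenizer accepted by cached_tokenize should work here.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
with open("corpus.txt", encoding="utf-8") as f:  # hypothetical path
    data = f.read()

# The progress bar ticks once per raw-text chunk of max_length characters.
sequences = preprocess_data(data, tokenizer, max_length=8192, overlap=512)
print(f"Created {len(sequences)} training sequences")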
