From aa4739357d44598a7c1e7a6f7cbaf2a17c741978 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Tue, 24 Sep 2024 23:39:42 +0000 Subject: [PATCH] fix: do padding ourself --- src/zeroband/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zeroband/data.py b/src/zeroband/data.py index afff1994..61a1a986 100644 --- a/src/zeroband/data.py +++ b/src/zeroband/data.py @@ -69,7 +69,7 @@ def get_dataloader( ds = load_dataset("allenai/c4", "en", streaming=True) def tokenize_function(data): - outputs = tokenizer(data["text"], truncation=True, max_length=seq_length, padding="max_length") + outputs = tokenizer(data["text"], truncation=True, max_length=seq_length) return outputs tokenized_datasets = ds.map(