diff --git a/open_lm/datapreprocess/ray/tokenize_shuffle.py b/open_lm/datapreprocess/ray/tokenize_shuffle.py index 79bdea79..86a3c959 100644 --- a/open_lm/datapreprocess/ray/tokenize_shuffle.py +++ b/open_lm/datapreprocess/ray/tokenize_shuffle.py @@ -234,7 +234,7 @@ def _flush_buffer(self, folder, counter): tokens = [int(x) for x in self.buffer[i]["tokens"]] token_count += len(tokens) json_string = json.dumps(tokens) - uid = f"{tar_index_str}_{i:0{digits}}" + uid = hashlib.md5(json_string.encode()).hexdigest() sample = {"__key__": uid, "json.gz": json_string} sink.write(sample) bio.seek(0)