-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathsplit_df.py
49 lines (33 loc) · 1.25 KB
/
split_df.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 16 11:16:21 2020
@author: jacqu
Split dataframe into chunks for parallel docking using sbatch
"""
import pandas as pd
import numpy as np
import os
import argparse
def splitDataFrame(df, chunkSize):
    """Split ``df`` into consecutive chunks of at most ``chunkSize`` rows.

    Rows are taken in order with plain slicing (``df[start:stop]``), so any
    object supporting ``len()`` and slicing works, not only a DataFrame.
    Every row ends up in exactly one chunk: when ``len(df)`` is not a
    multiple of ``chunkSize`` the leftover rows form a final, shorter chunk
    instead of being dropped.

    Args:
        df: DataFrame (or other sliceable sequence) to split.
        chunkSize: maximum number of rows per chunk; must be positive.

    Returns:
        List of slices of ``df`` in their original order; empty when ``df``
        has no rows.

    Raises:
        ValueError: if ``chunkSize`` is zero or negative.
    """
    if chunkSize <= 0:
        raise ValueError(f'chunkSize must be a positive integer, got {chunkSize!r}')
    # Stepping by chunkSize up to len(df) also covers the trailing partial chunk.
    return [df[start:start + chunkSize] for start in range(0, len(df), chunkSize)]
def cline():
    """Parse the command-line arguments and call ``main`` with them.

    Options:
        -df / --dataframe: path of the CSV file to split.
        -n / --num_chunks: number of chunks to split into (integer).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-df", "--dataframe", default='data/ligands/abl1_dude.csv', help="pd dataframe file to split")
    # type=int is required: without it a value given on the command line is
    # a str, and the integer division done in main() fails with TypeError.
    parser.add_argument("-n", "--num_chunks", type=int, default=3, help="Number of chunks to split in ")
    args = parser.parse_args()
    main(args)
def main(args):
    """Load a CSV file, split it into chunks and write each chunk to disk.

    The chunk size is ``rows // num_chunks``; the actual split is delegated
    to ``splitDataFrame``. Chunks are written to
    ``data/ligands/split_batch_<k>.csv`` with ``k`` starting at 1.

    Args:
        args: namespace with ``dataframe`` (path of the CSV file to read)
            and ``num_chunks`` (requested number of chunks).

    Raises:
        ValueError: if ``num_chunks`` is not a positive integer, or if the
            file has fewer rows than ``num_chunks`` (chunk size would be 0).
    """
    # Coerce so that a string coming from an untyped CLI option still works.
    num_chunks = int(args.num_chunks)
    if num_chunks <= 0:
        raise ValueError(f'num_chunks must be a positive integer, got {args.num_chunks!r}')

    df = pd.read_csv(args.dataframe)
    print('Dataframe loaded. Chunks will be saved in data/ligands')

    chunkSize = df.shape[0] // num_chunks
    if chunkSize == 0:
        # Fail with a clear message instead of a ZeroDivisionError downstream.
        raise ValueError(
            f'Cannot split {df.shape[0]} rows into {num_chunks} chunks: '
            'each chunk would be empty'
        )

    # Make sure the hard-coded output directory exists before writing.
    os.makedirs('data/ligands', exist_ok=True)

    chunks = splitDataFrame(df, chunkSize)
    for batch_number, chunk in enumerate(chunks, start=1):
        chunk.to_csv(f'data/ligands/split_batch_{batch_number}.csv')

    # Report the number of files actually written: with floor division it can
    # differ from the requested count (e.g. 10 rows, 4 requested -> 5 chunks).
    print(f'Saved {len(chunks)} chunks of up to {chunkSize} mols to data/ligands/')
# Script entry point: parse the command line only when run directly.
if __name__ == '__main__':
    cline()