-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreduce_df__memory_size.py
72 lines (62 loc) · 2.95 KB
/
reduce_df__memory_size.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import numpy as np
def _to_python_scalar(value):
    """Return *value* as a plain Python scalar so range comparisons are exact."""
    return value.item() if hasattr(value, "item") else value


def _smallest_int_dtype(lo, hi):
    """Return the narrowest NumPy integer dtype that holds [lo, hi], or None."""
    if lo >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        # Also reached when lo is NaN (empty column); no candidate will match.
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        bounds = np.iinfo(candidate)
        if bounds.min <= lo and hi <= bounds.max:
            return candidate
    return None


def _holds_only_whole_numbers(column):
    """Return True when every value of *column* is a finite whole number."""
    if column.dtype.kind != "f":
        # Boolean and integer columns are whole numbers by construction.
        return True
    values = column.to_numpy()
    # Compare element-wise: summing the differences would let positive and
    # negative fractional parts cancel out and misclassify the column.
    return bool(np.isfinite(values).all() and (values == np.floor(values)).all())


def reduce_mem_usage(props, print_flag=False):
    """Downcast the numeric columns of *props* to smaller dtypes, in place.

    Only columns backed by a NumPy boolean, integer or float dtype are
    processed; every other column (strings, categoricals, datetimes, pandas
    extension dtypes) is left untouched.

    For each processed column:

    * Missing values in a float column are replaced by ``min - 1`` (a
      sentinel below every real value) and the column name is recorded.
      Nothing is filled when the column has no finite minimum.
    * If every value is a finite whole number, the column is cast to the
      narrowest signed/unsigned integer dtype that can hold its range.
    * Otherwise a float column wider than 32 bits is cast to ``float32``
      (this loses precision by design).

    Args:
        props: DataFrame to shrink. It is modified in place.
        print_flag: When true, print per-column dtypes and the memory usage
            before and after.

    Returns:
        A tuple ``(props, NAlist)`` where ``props`` is the same DataFrame
        object and ``NAlist`` lists the columns whose missing values were
        replaced by the sentinel.
    """
    start_mem_usg = props.memory_usage().sum() / 1024 ** 2
    if print_flag:
        print("Memory usage of properties dataframe is :", start_mem_usg, " MB")

    NAlist = []  # Keeps track of columns that have missing values filled in.
    for col in props.columns:
        dtype = props[col].dtype
        # Restrict to NumPy bool/int/uint/float columns; anything else cannot
        # be range-checked or downcast safely.
        if not (isinstance(dtype, np.dtype) and dtype.kind in "biuf"):
            continue

        if print_flag:
            # Print current column type
            print("******************************")
            print("Column: ", col)
            print("dtype before: ", dtype)

        lo = _to_python_scalar(props[col].min())
        hi = _to_python_scalar(props[col].max())

        # Integer dtypes do not support NA, therefore NA needs to be filled.
        if dtype.kind == "f" and props[col].isna().any() and np.isfinite(lo):
            NAlist.append(col)
            # Track the new minimum so the sentinel is included in the range
            # check below (otherwise it could be cast to an unsigned type).
            lo -= 1
            # Assign back instead of chained ``inplace=True`` so the fill
            # also takes effect under pandas Copy-on-Write.
            props[col] = props[col].fillna(lo)

        column = props[col]
        target = None
        if _holds_only_whole_numbers(column):
            target = _smallest_int_dtype(lo, hi)

        if target is not None:
            # Make Integer/unsigned Integer datatypes
            props[col] = column.astype(target)
        elif dtype.kind == "f" and dtype.itemsize > 4:
            # Make float datatypes 32 bit (never widen a narrower float).
            props[col] = column.astype(np.float32)

        if print_flag:
            # Print new column type
            print("dtype after: ", props[col].dtype)
            print("******************************")

    # Print final result
    if print_flag:
        print("___MEMORY USAGE AFTER COMPLETION:___")
        mem_usg = props.memory_usage().sum() / 1024 ** 2
        print("Memory usage is: ", mem_usg, " MB")
        print("This is ", 100 * mem_usg / start_mem_usg, "% of the initial size")
    return props, NAlist