-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgroup_by_writer.py
81 lines (66 loc) · 2.93 KB
/
group_by_writer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import os
import sys
import pandas as pd
from PIL import Image
from tqdm import tqdm
def main(root='data/', digits_only=False):
size = (28, 28)
split = 'write_digits' if digits_only else 'write_all'
def class_list(digits_only=True):
digits = [hex(i)[2:] for i in range(ord('0'), ord('9') + 1)]
upper = [hex(i)[2:] for i in range(ord('A'), ord('Z') + 1)]
lowwer = [hex(i)[2:] for i in range(ord('a'), ord('z') + 1)]
return digits if digits_only else digits + upper + lowwer
# get the writer info from by_class
df = pd.DataFrame(columns=['file', 'target', 'label', 'writer', 'source'])
# get the writer id for each image
for label in class_list(digits_only=digits_only):
for group in range(8):
if not os.path.exists(f'{root}by_class/{label}/hsf_{group}.mit'):
continue
mit_path = f'{root}by_class/{label}/hsf_{group}.mit'
temp_df = pd.read_csv(mit_path, sep=' ', header=None, skiprows=1, names=['file', 'target'])
temp_df['label'] = label
temp_df['writer'] = temp_df['target'].apply(lambda x: x.split('/')[0])
temp_df['target'] = temp_df.apply(lambda x: f'{split}/{x["writer"]}/{x["label"]}/', axis=1)
temp_df['source'] = f'by_class/{label}/hsf_{group}/'
df = pd.concat([df, temp_df], ignore_index=True)
df = df[['file', 'source', 'target', 'writer', 'label']]
writer_list = df['writer'].unique()
label_list = df['label'].unique()
# create all directories in advance
if not os.path.exists(root + split):
os.makedirs(root + split)
for writer in writer_list:
if not os.path.exists(root + f'{split}/{writer}'):
os.makedirs(root + f'{split}/{writer}')
for label in label_list:
if not os.path.exists(root + f'{split}/{writer}/{label}'):
os.makedirs(root + f'{split}/{writer}/{label}')
print('Group images by writer...')
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
source = root + row['source'] + row['file']
target = root + row['target'] + row['file']
# print(source, target)
# print(os.path.exists(source), os.path.exists(target))
# read image
img = Image.open(source)
gray = img.convert('L')
gray.thumbnail(size, Image.ANTIALIAS)
# save gray image to target
gray.save(target)
if __name__ == '__main__':
if len(sys.argv) == 2:
main(digits_only=sys.argv[1].lower()=='true')
else:
main()
# convert original image to vector of 28 * 28 = 784
# same as in LEAF: https://github.com/TalwalkarLab/leaf/blob/master/data/femnist/preprocess/data_to_json.py
# line 62 - 68
# img = Image.open(file_path)
# gray = img.convert('L')
# gray.thumbnail(size, Image.ANTIALIAS)
# arr = np.asarray(gray).copy()
# vec = arr.flatten()
# vec = vec / 255 # scale all pixel values to between 0 and 1
# vec = vec.tolist()