forked from UKPLab/5pils
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutils.py
121 lines (103 loc) · 3.66 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import os
import json
import glob
import base64
def load_json(file_path):
    '''
    Read a JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file. It is opened as UTF-8 text.

    Returns:
        The Python object produced by json.load for the file content.
    '''
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)
def concatenate_entry(d):
    '''
    Flatten every list value of a dictionary into a single string.

    Each value that is a list is replaced by its items converted with str
    and joined with ';'. Other values are left as they are. The dictionary
    is modified in place and the same object is returned.
    '''
    flattened = {
        name: ';'.join(str(item) for item in content)
        for name, content in d.items()
        if isinstance(content, list)
    }
    # Only existing keys are overwritten, so the key order of d is unchanged
    d.update(flattened)
    return d
def append_to_json(file_path, data):
    '''
    Append a dict or a list of dicts to a JSON file.

    The file is expected to contain a JSON list. If it does not exist yet,
    it is created. Every appended dict first goes through
    concatenate_entry, which joins its list values into ';'-separated
    strings (the dict is modified in place).

    Args:
        file_path: Path of the JSON file to update.
        data: A dict, or a list of dicts. Items of the list that are not
            dicts are ignored.

    Returns:
        None. If the existing file cannot be parsed as JSON, an error
        message is printed and the file is left untouched.
    '''
    if os.path.exists(file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                file_data = json.load(file)
        except json.JSONDecodeError:
            print(f"Error: {file_path} is not a valid JSON file.")
            return
    else:
        # A file that does not exist yet starts as an empty list of entries
        file_data = []
    if isinstance(data, list):
        file_data.extend(concatenate_entry(d) for d in data if isinstance(d, dict))
    else:
        file_data.append(concatenate_entry(data))
    # Serialize before opening the file for writing, so that a serialization
    # error cannot leave a half-written file behind
    serialized = json.dumps(file_data, indent=4)
    # Mode 'w' truncates the file: no stale bytes of a longer previous
    # content can remain after the new content
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(serialized)
def save_result(output, json_file_path):
    '''
    Save output results to a JSON file.

    Args:
        output: The results to save. A string is parsed as JSON first;
            any other value is passed to append_to_json as it is.
        json_file_path: Path of the JSON file the results are appended to.

    Returns:
        None. A string that is not valid JSON is not saved: a message is
        printed and the function returns without raising.
    '''
    if isinstance(output, str):
        try:
            parsed_output = json.loads(output)
        except json.JSONDecodeError:
            # The output was not well formatted: skip it, but leave a trace
            print(f"Error: output is not valid JSON and was not saved to {json_file_path}.")
            return
        append_to_json(json_file_path, parsed_output)
    else:
        append_to_json(json_file_path, output)
def entry_exists(json_file_path, url):
    '''
    Check if an entry for the given URL already exists in the JSON file.

    The comparison is made on identifiers: the last '/'-separated segment
    of url, cut at its first '.', is compared with the last '/'-separated
    segment of the "URL" value of each entry.

    Args:
        json_file_path: Path of the JSON file holding the entries.
        url: URL or file path that identifies the entry to look for.

    Returns:
        True if a matching entry is found. False otherwise, including when
        the file does not exist or is not valid JSON (in the latter case an
        error message is printed).
    '''
    try:
        with open(json_file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except json.JSONDecodeError:
        print(f"Error: {json_file_path} is not a valid JSON file.")
        return False
    except FileNotFoundError:
        return False
    # The identifier does not depend on the entry: compute it only once
    target = url.split('/')[-1].split('.')[0]
    for entry in data:
        if not isinstance(entry, dict):
            continue
        stored_url = entry.get("URL")
        # Entries without a usable "URL" cannot match: skip them instead of failing
        if isinstance(stored_url, str) and stored_url.split('/')[-1] == target:
            return True
    return False
def is_folder_empty(folder_path):
    '''
    Check if the given folder is empty.

    Args:
        folder_path: Path of the folder to inspect.

    Returns:
        True if folder_path is an existing directory without any entry.
        False if it contains at least one entry, and also if it does not
        exist or is not a directory.
    '''
    if not os.path.isdir(folder_path):
        # Explicit False (instead of an implicit None) keeps the result a bool
        return False
    # The first entry is enough to decide: no need to list the whole folder
    with os.scandir(folder_path) as entries:
        return next(entries, None) is None
def get_corpus(directory, json_file_path, image_directory):
    '''
    Collect the text of each text file of the given directory that still
    has to be processed.

    A '.txt' file is skipped when is_folder_empty reports its image folder
    (image_directory/<file name without '.txt'>) as empty, or when
    entry_exists finds an entry for it in the JSON file.

    Args:
        directory: Folder containing the '.txt' files.
        json_file_path: JSON file listing the entries that already exist.
        image_directory: Folder containing one image folder per text file.

    Returns:
        A list with, for each retained file, its content up to the first
        occurrence of 'Image URLs'.
    '''
    corpus = []
    for file_name in os.listdir(directory):
        if not file_name.endswith('.txt'):
            continue
        image_folder_name = file_name[:-4]  # Remove '.txt'
        image_folder_path = os.path.join(image_directory, image_folder_name)
        if is_folder_empty(image_folder_path):
            continue
        # Pass the bare file name, not the joined path: entry_exists splits on
        # '/', which is not the separator used by os.path.join on every platform
        if entry_exists(json_file_path, file_name):
            continue
        with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as file:
            text = file.read()
        # Keep only the part of the text that precedes the list of image URLs
        corpus.append(text.split('Image URLs')[0])
    return corpus
def encode_image(image_path):
    '''
    Encode an image in base64, as a data URL. Format required by GPT4-Vision.

    Args:
        image_path: Path of the image file.

    Returns:
        A string of the form 'data:<mime type>;base64,<encoded content>'.
        The MIME type is guessed from the file extension; 'image/png' is
        used when the extension is unknown or is not an image type.
    '''
    # Imported here so that the function does not depend on a new module-level import
    import mimetypes

    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith('image/'):
        mime_type = 'image/png'
    with open(image_path, "rb") as image_file:
        encoded = base64.b64encode(image_file.read()).decode('utf-8')
    return f"data:{mime_type};base64,{encoded}"