-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdfprofessor.py
173 lines (155 loc) · 6.46 KB
/
pdfprofessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import os
import fitz # PyMuPDF
import subprocess
import concurrent.futures
import logging
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Load configuration from a JSON file
def load_config(config_file='config.json'):
with open(config_file, 'r') as f:
return json.load(f)
config = load_config()
pdf_directory = config['pdf_directory']
output_directory = config['output_directory'] # Or a fixed path if you prefer
log_directory = config['log_directory']
chunk_storage_directory = config['chunk_storage_directory']
ollama_command = config['ollama_command']
chunk_size = config['chunk_size']
progress_log_file = os.path.join(log_directory, 'progress.log')
scripts_directory = "/Users/mbaosint/Desktop/Projects/PDF-Professor/Scripts"
# Create directories if they do not exist
os.makedirs(output_directory, exist_ok=True)
os.makedirs(log_directory, exist_ok=True)
os.makedirs(chunk_storage_directory, exist_ok=True)
os.makedirs(scripts_directory, exist_ok=True)
# Function to load progress log
def load_progress_log():
if os.path.exists(progress_log_file):
with open(progress_log_file, 'r') as f:
return json.load(f)
return {}
# Function to update progress log
def update_progress_log(file_name, chunk_index):
progress_log[file_name] = chunk_index
with open(progress_log_file, 'w') as f:
json.dump(progress_log, f)
# Function to process text with Ollama
def process_with_ollama(text_chunk, user_prompt):
combined_prompt = f"{user_prompt}\n\n{text_chunk}"
try:
process = subprocess.Popen(
ollama_command,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True
)
stdout, stderr = process.communicate(input=combined_prompt, timeout=120)
if process.returncode != 0:
logging.error(f"Ollama processing failed: {stderr}")
return text_chunk
return stdout
except subprocess.TimeoutExpired:
logging.error("Ollama processing timed out!")
return text_chunk
except Exception as e:
logging.error(f"Error in process_with_ollama: {e}")
return text_chunk
# Function to train the Ollama model
def train_ollama_model(text):
try:
process = subprocess.Popen(
ollama_command,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True
)
stdout, stderr = process.communicate(input=text, timeout=120)
if process.returncode != 0:
logging.error(f"Ollama training failed: {stderr}")
return stderr
return stdout
except subprocess.TimeoutExpired:
logging.error("Ollama training timed out!")
return "Training timeout"
except Exception as e:
logging.error(f"Error in train_ollama_model: {e}")
return str(e)
# Function to save processed data locally
def save_data_locally(file_name, data, directory):
file_path = os.path.join(directory, file_name)
try:
with open(file_path, 'w', encoding='utf-8') as file:
file.write(data)
return file_path
except Exception as e:
logging.error(f"Error saving data locally: {e}")
return None
# Function to read PDF files
def read_pdf(file_path):
try:
doc = fitz.open(file_path)
content = ''
for page_num in range(doc.page_count):
page = doc.load_page(page_num)
content += page.get_text()
return content
except Exception as e:
logging.warning(f"PyMuPDF failed to read {file_path}, falling back to poppler: {e}")
try:
from subprocess import run
text = run(["pdftotext", file_path, "-"], capture_output=True, text=True).stdout
return text
except Exception as e:
logging.error(f"Poppler also failed to read {file_path}: {e}")
return None
def process_file(file_name, user_prompt):
try:
file_path = os.path.join(pdf_directory, file_name)
content = read_pdf(file_path)
if content:
chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
processed_content = ''
last_processed_chunk = progress_log.get(file_name, 0)
with tqdm(total=len(chunks), desc=f'Processing {file_name}', dynamic_ncols=True) as pbar:
with ThreadPoolExecutor(max_workers=1) as executor:
future_chunks = {
executor.submit(process_with_ollama, chunk, user_prompt): i
for i, chunk in enumerate(chunks)
if i >= last_processed_chunk
}
for future in as_completed(future_chunks):
processed_chunk = future.result()
chunk_index = future_chunks[future]
processed_content += processed_chunk
update_progress_log(file_name, chunk_index + 1)
pbar.update(1)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
script_name = f"script_{timestamp}.txt"
save_data_locally(script_name, processed_content, scripts_directory)
train_output = train_ollama_model(processed_content)
logging.info(f"Processed {file_name}, trained: {train_output}")
return f"Processed {file_name}, trained: {train_output}"
else:
logging.error(f"Failed to read file: {file_name}")
return f"Failed to read file: {file_name}"
except Exception as e:
logging.error(f"Error processing file {file_name}: {e}")
return f"Error processing file {file_name}: {e}"
def main():
global progress_log
progress_log = load_progress_log()
user_prompt = input("Enter your prompt for Ollama: ")
pdf_files = [f for f in os.listdir(pdf_directory) if f.endswith('.pdf')]
with ThreadPoolExecutor(max_workers=2) as executor:
futures = {executor.submit(process_file, file_name, user_prompt): file_name for file_name in pdf_files}
for future in tqdm(as_completed(futures), total=len(futures), desc='Processing PDFs'):
logging.info(future.result())
if __name__ == "__main__":
main()