-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvision.py
107 lines (95 loc) · 4.95 KB
/
vision.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# Install Anaconda 4.4.10 (>> conda info)
# pip install google-api-python-client
# pip install google-cloud-storage
# pip install google-auth-httplib2
# pip install PyPDF2
# pip install Wand (wand.image is imported below; Wand needs ImageMagick installed)
# Steps
# 0. Create Google Account - accounts.google.com
# 1. Create GCP project - console.cloud.google.com, instructions at https://cloud.google.com/resource-manager/docs/creating-managing-projects
# 2. Create service account for project and save json key - https://cloud.google.com/docs/authentication/production#auth-cloud-implicit-python
# 3. Enable Vision API, Storage API for project - https://console.cloud.google.com/apis/library?project=form-processing
# 4. Create storage bucket matching variable name CROPPED_JPEG_BUCKET_NAME (ensure service account has permissions to modify it)
# Google vision api documentation - https://cloud.google.com/vision/docs/reference/rest/v1/images/annotate
from googleapiclient.discovery import build
from google.oauth2 import service_account
from google.cloud import storage
from wand.image import Image
import os
from PyPDF2 import PdfFileReader
# OAuth scopes required for the GCP Storage and Vision APIs.
SCOPES = [
    'https://www.googleapis.com/auth/cloud-platform',
    'https://www.googleapis.com/auth/cloud-vision',
]

# Service-account key file, looked up in a "creds" directory next to this script.
SERVICE_ACCOUNT_CREDENTIALS_PATH = os.path.join(
    os.path.dirname(__file__),
    'creds',
    'sa-505-at-form-processing_iam_gserviceaccount_com.json',
)
credentials = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_CREDENTIALS_PATH,
    scopes=SCOPES,
)

# GCS bucket the cropped image is uploaded to so the Vision API can read it.
CROPPED_JPEG_BUCKET_NAME = 'jpeg-cropped-address'

# Local working directories (relative to the current directory).
INCOMING_DIR = 'pdf-incoming'            # PDF files waiting to be processed
PROCESSED_DIR = 'pdf-processed'          # PDFs whose text was extracted
CROPPED_JPEG_DIR = 'jpeg_address_block'  # cropped JPEG renderings
ERROR_DIR = 'pdf-error'                  # files that could not be handled
TEXT_DIR = 'address_text_read'           # extracted text output

# An API developer key can only be used with publicly accessible files, so the
# service account above is used instead.
# NOTE(review): a literal key used to sit in this comment; never commit real
# keys - revoke that one and load any replacement from the environment.
#APIKEY="<redacted>"

# Authenticated Google service objects.
service = build('vision', 'v1', credentials=credentials)
google_storage_client = storage.Client.from_service_account_json(
    SERVICE_ACCOUNT_CREDENTIALS_PATH
)

# Handle to the working bucket.
cropped_jpeg_bucket = google_storage_client.get_bucket(CROPPED_JPEG_BUCKET_NAME)
def isFileTypePDF(filename):
    '''
    Checks whether filename is a PDF by trying to open it with PyPDF2 and
    reading its page count. The file extension does not play into it.

    :param filename: filename and relative path
    :return: True if the file could be opened and its page count read;
             False on any failure (missing file, directory, unreadable or
             non-PDF content)
    '''
    try:
        # Close the handle on every path so the caller can move/rename the
        # file afterwards without a lingering open descriptor.
        with open(filename, "rb") as pdf_handle:
            # Reading numPages makes PyPDF2 actually parse the document.
            _ = PdfFileReader(pdf_handle).numPages
        return True
    except Exception:
        # Any failure is treated as "not a PDF". Catching Exception rather
        # than using a bare except lets KeyboardInterrupt/SystemExit through.
        return False
# Process every entry in the incoming directory. PDFs have the address block of
# their first page rendered to JPEG, uploaded to GCS, read with the Vision API
# and the detected text written to TEXT_DIR; the PDF is then moved to
# PROCESSED_DIR. Anything that is not a PDF, or in which no text is detected,
# is moved to ERROR_DIR so it is not picked up again on the next run.
incoming_path = os.path.join(os.curdir, INCOMING_DIR)
pdf_files = os.listdir(incoming_path)
for pdf_file in pdf_files:
    source_path = os.path.join(incoming_path, pdf_file)
    error_path = os.path.join(os.curdir, ERROR_DIR, pdf_file)

    if not isFileTypePDF(source_path):
        print('%s is not of file type PDF. Moved to %s directory to prevent processing again. Verify file.'%(pdf_file,ERROR_DIR))
        os.rename(source_path, error_path)
        continue

    jpeg_name = '%s.jpeg' % pdf_file
    jpeg_path = os.path.join(os.curdir, CROPPED_JPEG_DIR, jpeg_name)

    # Convert the first page of the PDF scan ("[0]") to JPEG at 300 DPI and
    # crop it to the address block. Wand's crop() modifies the image in place,
    # so the converted image itself is what gets saved. The context managers
    # release the underlying ImageMagick resources.
    with Image(filename='%s[0]' % source_path, resolution=300) as image_pdf:
        with image_pdf.convert('jpeg') as image_jpeg:
            image_jpeg.crop(200, 600, 1100, 1000)  # left, top, right, bottom
            image_jpeg.save(filename=jpeg_path)

    # Upload cropped image to Google Cloud Storage
    blob = cropped_jpeg_bucket.blob(jpeg_name)
    blob.upload_from_filename(jpeg_path)

    try:
        # Read text from image (uploaded cropped JPEG)
        IMAGE = "gs://%s/%s" % (CROPPED_JPEG_BUCKET_NAME, jpeg_name)
        request = service.images().annotate(body={
            'requests': [{
                'image': {
                    'source': {
                        'gcs_image_uri': IMAGE
                    }
                },
                'features': [{
                    'type': 'TEXT_DETECTION',
                    'maxResults': 3,
                }]
            }],
        })
        responses = request.execute(num_retries=3)
    finally:
        # Clean up GCS even when the Vision request fails, so temporary
        # uploads do not accumulate in the bucket.
        blob.delete()

    # 'textAnnotations' is absent when no text was detected (or the request
    # for this image failed); the first annotation holds the full text.
    results = responses.get('responses') or [{}]
    annotations = results[0].get('textAnnotations')
    if not annotations:
        print('No text detected in %s. Moved to %s directory to prevent processing again. Verify file.' % (pdf_file, ERROR_DIR))
        os.rename(source_path, error_path)
        continue

    output_filename = os.path.join(os.curdir, TEXT_DIR, '%s.txt' % pdf_file)
    # The with-block guarantees the text is flushed and the file closed.
    with open(output_filename, "w") as output_file:
        output_file.write(annotations[0]['description'])

    # Move original file to processed directory
    os.rename(source_path, os.path.join(os.curdir, PROCESSED_DIR, pdf_file))