-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlambda_function.py
58 lines (44 loc) · 1.79 KB
/
lambda_function.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import json
import urllib.parse
import boto3
import os
# Emitted when the module is imported, before any handler call.
print('Loading function')
# Both clients live at module scope, so they are constructed once per import
# and shared by every function below instead of being rebuilt on each call.
# Amazon S3 client
s3 = boto3.client('s3')
# Amazon Textract client
textract = boto3.client('textract')
def getTextractData(bucketName, documentKey):
    """Run Textract text detection on an S3 object and return its lines.

    Args:
        bucketName: Name of the S3 bucket that holds the document.
        documentKey: Key of the document inside that bucket.

    Returns:
        The ``Text`` of every ``LINE`` block in the Textract response, in
        response order, each followed by a newline. Empty string when the
        response contains no ``LINE`` blocks.
    """
    print('Loading getTextractData')
    # The document is handed to Textract by S3 reference, not as raw bytes.
    s3_location = {'Bucket': bucketName, 'Name': documentKey}
    result = textract.detect_document_text(Document={'S3Object': s3_location})
    # Keep only LINE blocks; every other block type in the response is skipped.
    line_texts = (
        block['Text'] + '\n'
        for block in result['Blocks']
        if block['BlockType'] == 'LINE'
    )
    return ''.join(line_texts)
def writeTextractToS3File(textractData, bucketName, createdS3Document):
    """Store extracted text in S3 under a key derived from the source key.

    The destination key is the source key with its extension swapped for
    ``.txt`` and every occurrence of ``input`` replaced by ``output``.

    Args:
        textractData: Text to store as the object body.
        bucketName: Bucket to write to.
        createdS3Document: Key of the document the text was extracted from.

    Returns:
        None. The object is written to S3 and the destination key is printed.
    """
    print('Loading writeTextractToS3File')
    stem, _extension = os.path.splitext(createdS3Document)
    # NOTE(review): str.replace rewrites every 'input' substring in the key,
    # not only a leading folder name - confirm that this is intended.
    destinationKey = (stem + '.txt').replace('input', 'output')
    s3.put_object(Body=textractData, Bucket=bucketName, Key=destinationKey)
    print(f'Generated {destinationKey}')
def lambda_handler(event, context):
    """Handle an S3 notification: extract the document's text and store it.

    Reads the bucket name and object key from the first record of the event,
    checks that the object is reachable, runs text detection on it through
    ``getTextractData`` and writes the result with ``writeTextractToS3File``.

    Args:
        event: Notification payload. Must provide
            ``event['Records'][0]['s3']['bucket']['name']`` and
            ``event['Records'][0]['s3']['object']['key']``.
        context: Runtime context object; not used.

    Returns:
        The string ``'Processing Done!'`` on success.

    Raises:
        KeyError / IndexError: If the event lacks the expected structure
            (raised before the try block, so nothing is logged for it).
        Exception: Any failure while probing the object, calling Textract or
            writing the result is printed and then re-raised unchanged.
    """
    # Only the first record of the notification is processed.
    s3_record = event['Records'][0]['s3']
    bucket = s3_record['bucket']['name']
    # The key arrives URL-encoded in the event, so decode it before use.
    key = urllib.parse.unquote_plus(s3_record['object']['key'], encoding='utf-8')
    try:
        # Existence/permission probe only. A HEAD request is used instead of
        # get_object because the object body is never needed here: get_object
        # returned a streaming body that was left unread and unclosed.
        s3.head_object(Bucket=bucket, Key=key)
        detectedText = getTextractData(bucket, key)
        writeTextractToS3File(detectedText, bucket, key)
        return 'Processing Done!'
    except Exception as e:
        print(e)
        print('Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket))
        # Bare raise re-raises the active exception with its original traceback.
        raise