# doc_scraping.py
# imports
# NOTE: a couple of local installations are required: tesseract and poppler
# for poppler installation instructions, see here (steps differ by operating system): https://pdf2image.readthedocs.io/en/latest/installation.html
# for tesseract installation, see here: https://tesseract-ocr.github.io/tessdoc/Installation.html
import numpy as np
import pandas as pd
import os
import re
from pdf2image import convert_from_path
from scipy import ndimage
import pytesseract
import matplotlib.pyplot as plt
from boxdetect import config
from boxdetect.pipelines import get_boxes
# add the tesseract executable to your PATH, or run the following command
# pytesseract.pytesseract.tesseract_cmd =r"/usr/local/Cellar/tesseract/5.3.4/bin/tesseract"
# you can test if tesseract is installed by calling `tesseract` in your command line (without the backticks)
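# optional sanity check (a small convenience snippet, not part of the pipeline): pytesseract can
# report the tesseract version it finds, which is a quick way to confirm the installation.
# try:
#     print('tesseract version:', pytesseract.get_tesseract_version())
# except pytesseract.TesseractNotFoundError:
#     print('tesseract executable not found; install it or set pytesseract.pytesseract.tesseract_cmd')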
DPI = 300
def get_file(case_number, file_type, file_dir):
""" Retrieve the path to a desired file given the file description and case
number.
Args:
case_number (str): case identifier, alphanumeric
file_type (str): the name of the legal file,
e.g. "complaint", "civil_case_cover_sheet"
file_dir (str): path to the folder containing all the scanned legal
documents
Returns: list of strings containing the file paths
"""
case_number = case_number.lower()
file_type = file_type.lower()
possible_files = []
for f in os.listdir(file_dir):
if f.lower().startswith(case_number) and file_type in f.lower():
if file_type != 'complaint' or (file_type == 'complaint'
and 'summons' not in f.lower()):
possible_files.append(os.path.join(file_dir, f))
return possible_files
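# illustrative usage of get_file (the case number, folder, and file name below are hypothetical):
#   get_file('20stlc01234', 'complaint', 'scanned_docs/')
#   # -> e.g. ['scanned_docs/20STLC01234_Complaint.pdf'] if such a file exists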
def cover_sheet_last_page_image(case_number, file_dir):
""" Gets the last page of the cover sheet, assuming it should have at least
6 pages.
Note that the address is almost always on the 6th page (occasionally
it is found on a 7th page; sometimes there are fewer pages and the
address is missing)
Args:
case_number (str): case identifier, alphanumeric
file_dir (str): path to the folder containing all the scanned legal
documents
Returns: PIL image
"""
# retrieve civil case cover file sheet
fpaths = get_file(case_number, 'civil_case_cover_sheet', file_dir)
if len(fpaths) == 0:
raise Exception(
f'could not find civil case cover sheet for case {case_number}')
elif len(fpaths) > 1:
raise Exception(
f'found {len(fpaths)} civil case cover sheets for case {case_number}: {fpaths}'
)
# convert pdf to image
# the address is almost always on the last (6th) sheet
images = convert_from_path(fpaths[0], dpi=DPI, first_page=6)
# not all civil case cover sheets have 6 pages, which means they might be missing the address in that document, so test for that
if len(images) == 0:
raise Exception(
'civil case cover sheet does not have 6 pages as expected; likely missing address'
)
last_page_image = images[-1]
return last_page_image
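# illustrative usage (hypothetical case number and folder); the returned PIL image can be saved
# or displayed directly for manual inspection:
#   page = cover_sheet_last_page_image('20stlc01234', 'scanned_docs/')
#   page.save('cover_sheet_last_page.png')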
def address_autocrop(last_page_image):
""" Automatically detect the boxes for each field in the address page of
the civil case cover sheet.
Args:
last_page_image: image object containing the last page of the civil
case cover sheet. Can be various types, such as a PIL image.
Returns: tuple of 4 np arrays containing the images of the cropped
boxes for the address, city, state, and zip code
"""
last_page_image = np.array(last_page_image)
# crop the last page to the top half since the file is large and makes transformations slow
# also crop a bit off the top since there is a box that sometimes gets confused for the address box
top_crop = 300
half_height = int(0.5 * last_page_image.shape[0])
last_page_image = last_page_image[top_crop:half_height, :]
## create the pipeline configs for the box detectors for the street address box, city box, and state/zip boxes ##
ADDRESS_CFG = config.PipelinesConfig()
# important to adjust these values to match the size of boxes on your image
ADDRESS_CFG.width_range = (800, 1500)
ADDRESS_CFG.height_range = (150, 500)
# some boxes are as short as 97px, but narrowing the height range that much makes it detect the wrong box more often than not
# more scaling factors give more accurate results, but also take more processing time
# a scaling factor that is too small may cause false positives
# a scaling factor that is too big will take a lot of processing time
# the image is resized based on the scaling factor
ADDRESS_CFG.scaling_factors = [0.7]
# w/h ratio range for boxes/rectangles filtering
ADDRESS_CFG.wh_ratio_range = (0.5, 12)
# group_size_range starting from 2 will skip all the groups
# with a single box detected inside (like checkboxes)
ADDRESS_CFG.group_size_range = (2, 100)
# number of iterations when running the dilation transformation (to enhance the image)
ADDRESS_CFG.dilation_iterations = 5
CITY_CFG = config.PipelinesConfig()
CITY_CFG.width_range = (250, 850)
CITY_CFG.height_range = (90, 210)
CITY_CFG.scaling_factors = [0.7]
CITY_CFG.wh_ratio_range = (1.5, 7.0)
CITY_CFG.group_size_range = (2, 100)
CITY_CFG.dilation_iterations = 2
STATEZIP_CFG = config.PipelinesConfig()
STATEZIP_CFG.width_range = (150, 450)
STATEZIP_CFG.height_range = (90, 210)
STATEZIP_CFG.scaling_factors = [0.7]
STATEZIP_CFG.wh_ratio_range = (0.6, 5)
STATEZIP_CFG.group_size_range = (2, 100)
STATEZIP_CFG.dilation_iterations = 2
## try to locate the field boxes now ##
# for the street address, crop to the right side of the page so we remove the REASON box
right_crop = 1000
last_page_image_right = last_page_image[:, right_crop:]
street_address_bbox_candidates, _, _, _ = get_boxes(last_page_image_right,
cfg=ADDRESS_CFG,
plot=False)
# if no boxes are detected, it's likely because the scan was rotated. the box detection algorithm is very sensitive to angle (within ~0.2 degrees), so we rotate the image in +-0.1 degree steps until we can detect boxes
max_rot = 2
if len(street_address_bbox_candidates) == 0:
deg = 0
print(
'could not find street address box, testing different image rotations now'
)
while len(street_address_bbox_candidates) == 0 and abs(deg) < max_rot:
# rotate +0.1 deg
deg = abs(deg) + 0.1
rotated = ndimage.rotate(last_page_image_right, deg, reshape=False)
# try box detection again
street_address_bbox_candidates, _, _, _ = get_boxes(
rotated, cfg=ADDRESS_CFG, plot=False)
if len(street_address_bbox_candidates) != 0:
print(f'box detected successfully at {deg} degree rotation')
break
# if that doesn't work, rotate in the opposite direction
deg = -deg
rotated = ndimage.rotate(last_page_image_right, deg, reshape=False)
# try box detection again
street_address_bbox_candidates, _, _, _ = get_boxes(
rotated, cfg=ADDRESS_CFG, plot=False)
if len(street_address_bbox_candidates) != 0:
print(f'box detected successfully at {deg} degree rotation')
break
# verify that street_address_bbox_candidates now has valid boxes, or whether it's still empty
if len(street_address_bbox_candidates) == 0:
raise Exception(
f'could not detect address box within +-{max_rot} degrees of rotation'
)
city_bbox_candidates, _, _, _ = get_boxes(last_page_image,
cfg=CITY_CFG,
plot=False)
# we have to repeat the rotation stuff because it's possible that city/state/zip will be recognized at different rotations from street address
if len(city_bbox_candidates) == 0:
deg = 0
print('could not find city box, testing different image rotations now')
while len(city_bbox_candidates) == 0 and abs(deg) < max_rot:
# rotate +0.1 deg
deg = abs(deg) + 0.1
rotated = ndimage.rotate(last_page_image, deg, reshape=False)
# try box detection again
city_bbox_candidates, _, _, _ = get_boxes(rotated,
cfg=CITY_CFG,
plot=False)
if len(city_bbox_candidates) != 0:
print(f'box detected successfully at {deg} degree rotation')
break
# if that doesn't work, rotate in the opposite direction
deg = -deg
rotated = ndimage.rotate(last_page_image, deg, reshape=False)
# try box detection again
city_bbox_candidates, _, _, _ = get_boxes(rotated,
cfg=CITY_CFG,
plot=False)
if len(city_bbox_candidates) != 0:
print(f'box detected successfully at {deg} degree rotation')
break
# verify that city_bbox_candidates now has valid boxes, or whether it's still empty
if len(city_bbox_candidates) == 0:
raise Exception(
f'could not detect city box within +-{max_rot} degrees of rotation'
)
state_zip_bbox_candidates, _, _, _ = get_boxes(last_page_image,
cfg=STATEZIP_CFG,
plot=False)
if len(state_zip_bbox_candidates) < 2:
deg = 0
print(
'could not find 2 state/zip boxes, testing different image rotations now'
)
while len(state_zip_bbox_candidates) < 2 and abs(deg) < max_rot:
# rotate +0.1 deg
deg = abs(deg) + 0.1
rotated = ndimage.rotate(last_page_image, deg, reshape=False)
# try box detection again
state_zip_bbox_candidates, _, _, _ = get_boxes(rotated,
cfg=STATEZIP_CFG,
plot=False)
if len(state_zip_bbox_candidates) >= 2:
print(f'boxes detected successfully at {deg} degree rotation')
break
# if that doesn't work, rotate in the opposite direction
deg = -deg
rotated = ndimage.rotate(last_page_image, deg, reshape=False)
# try box detection again
state_zip_bbox_candidates, _, _, _ = get_boxes(rotated,
cfg=STATEZIP_CFG,
plot=False)
if len(state_zip_bbox_candidates) >= 2:
print(f'box detected successfully at {deg} degree rotation')
break
# verify that state_zip_bbox_candidates now has at least 2 valid boxes
if len(state_zip_bbox_candidates) < 2:
raise Exception(
f'fewer than 2 boxes found for State and Zip Code fields after rotation corrections of +-{max_rot} degrees; aborting'
)
# the format of the bbox coordinates is [left, top, width, height]
# extract bounding box for street address
if street_address_bbox_candidates.shape[0] == 1:
# only one street address box found, use it
street_address_bbox = street_address_bbox_candidates[0]
else:
# TODO just check the boxes for text matching ADDRESS, and do that earlier during the rotation search, in case we detect the empty box under the address but not the actual address box itself (happens for some cases)
# take the rightmost box, since we likely also got the Reason box
street_address_bbox = street_address_bbox_candidates[0]
for i in range(1, street_address_bbox_candidates.shape[0]):
left_coord = street_address_bbox_candidates[i][0]
if left_coord > street_address_bbox[0]:
street_address_bbox = street_address_bbox_candidates[i]
# extract bounding box for city
if city_bbox_candidates.shape[0] == 1:
# only one city box found, use it
city_bbox = city_bbox_candidates[0]
else:
# take the leftmost box, since we likely also got the State box
city_bbox = city_bbox_candidates[0]
for i in range(1, city_bbox_candidates.shape[0]):
left_coord = city_bbox_candidates[i][0]
if left_coord < city_bbox[0]:
city_bbox = city_bbox_candidates[i]
# extract bounding box for state and zip
if state_zip_bbox_candidates.shape[0] > 2:
# more than two boxes found
# filter by general vertical position and remove the boxes we think are incorrect
# vertically, each box should lie roughly between y=650 and y=1700 (top edge > 650px and bottom edge < 1700px), matching the check below
state_zip_bbox_candidates_orig = state_zip_bbox_candidates
state_zip_bbox_candidates = []
for bbox in state_zip_bbox_candidates_orig:
top = bbox[1]
bottom = bbox[1] + bbox[3]
if top > 650 and bottom < 1700:
state_zip_bbox_candidates.append(bbox)
state_zip_bbox_candidates = np.array(state_zip_bbox_candidates)
# by the end of this filtering process we should hopefully have exactly 2 boxes left
if state_zip_bbox_candidates.shape[0] != 2:
raise Exception(
'after filtering out possible wrong state/zip boxes, did not end up with exactly 2 boxes; aborting'
)
# we should be left with exactly two boxes now
# should be state and zip, state is on the left
if state_zip_bbox_candidates[0][0] < state_zip_bbox_candidates[1][0]:
state_bbox = state_zip_bbox_candidates[0]
zip_bbox = state_zip_bbox_candidates[1]
else:
state_bbox = state_zip_bbox_candidates[1]
zip_bbox = state_zip_bbox_candidates[0]
# make the crops
# and add a little bit of padding around the crops in case the auto bbox was too tight (the OCR doesn't like very tight crops)
address_crop = last_page_image_right[street_address_bbox[1] -
30:street_address_bbox[1] +
street_address_bbox[3] + 30,
street_address_bbox[0] -
30:street_address_bbox[0] +
street_address_bbox[2] + 30]
city_crop = last_page_image[city_bbox[1] - 10:city_bbox[1] + city_bbox[3] +
10, city_bbox[0] - 50:city_bbox[0] +
city_bbox[2] + 10]
state_crop = last_page_image[state_bbox[1] - 10:state_bbox[1] +
state_bbox[3] + 10, state_bbox[0] -
10:state_bbox[0] + state_bbox[2] + 10]
# add extra padding to the bottom of the zip code bbox since sometimes the zip code is 2 lines long and exceeds the bounds
zip_crop = last_page_image[zip_bbox[1] - 10:zip_bbox[1] + zip_bbox[3] + 40,
zip_bbox[0] - 10:zip_bbox[0] + zip_bbox[2] + 10]
return address_crop, city_crop, state_crop, zip_crop
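# NOTE: the three rotation-retry loops above share the same structure; a helper along these lines
# could replace them. this is only a sketch and is not wired in (the helper name, `min_boxes`, and
# `step` are new here, not part of the original pipeline):
# def _boxes_with_rotation_retry(image, cfg, min_boxes=1, max_rot=2, step=0.1):
#     """Run get_boxes, nudging the rotation by +-step degrees until enough boxes are found."""
#     boxes, _, _, _ = get_boxes(image, cfg=cfg, plot=False)
#     deg = 0
#     while len(boxes) < min_boxes and deg < max_rot:
#         deg += step
#         for signed_deg in (deg, -deg):
#             rotated = ndimage.rotate(image, signed_deg, reshape=False)
#             boxes, _, _, _ = get_boxes(rotated, cfg=cfg, plot=False)
#             if len(boxes) >= min_boxes:
#                 return boxes, signed_deg
#     return boxes, 0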
def address_from_crops(address_crop,
city_crop,
state_crop,
zip_crop,
verbose=True):
""" Extract text using OCR from the cropped boxes around each address field
on the civil case cover sheet.
<address/city/state/zip>_crop: image cropped around the respective
address field
verbose (bool): whether or not to print the retrieved address fields
Returns: tuple of strings containing street address, city, state, and zip code, if extraction was successful
"""
# run OCR on the crops
tesseract_config = f"--oem 1 --dpi {DPI}"
# NOTE the tesseract character whitelist only works for the legacy mode, not the newer neural net/LSTM mode, which apparently doesn't respect the whitelist - so we can't really force it to exclude weird punctuation or diacritics
# (https://stackoverflow.com/a/49030935/10536083)
streetaddress_texts = pytesseract.image_to_string(address_crop,
config=tesseract_config)
city_texts = pytesseract.image_to_string(city_crop,
config=tesseract_config)
state_texts = pytesseract.image_to_string(state_crop,
config=tesseract_config)
zip_texts = pytesseract.image_to_string(zip_crop, config=tesseract_config)
#### extract the address info from the text blocks ####
# drop the text containing "CITY"/"ADDRESS"/etc, remove punctuation, strip whitespace
# TODO filter out OCR artifacts such as stray characters like -_:;'" (note we can't filter out periods or commas because those can be valid parts of the address, usually commas separating street/apt and periods after the street abbreviation)
## extract street address ##
# strip whitespace so that it should start with 'ADDRESS'
streetaddress_texts = streetaddress_texts.strip()
address_start_idx = streetaddress_texts.upper().find('ADDRESS')
# also check for typos of ADDRESS
# TODO create a more flexible way of checking for typos using edit distance
if address_start_idx == -1:
address_start_idx = streetaddress_texts.upper().find('ADORESS')
if address_start_idx == -1:
address_start_idx = streetaddress_texts.upper().find('AOORESS')
if address_start_idx == -1:
address_start_idx = streetaddress_texts.upper().find('AODRESS')
assert address_start_idx != -1, f'the street address block should start with ADDRESS but could not find that word in: {streetaddress_texts}'
# remove the starting text corresponding to 'ADDRESS'
streetaddress = streetaddress_texts[address_start_idx + len('ADDRESS'):]
# remove the semicolon, or similar punctuation that the OCR might have misread as the semicolon. sometimes it doesn't catch the semicolon at all, so we also need to check for that
streetaddress = streetaddress.strip()
if streetaddress[0] in [':', ';', ',', '.', '\'']:
streetaddress = streetaddress[1:]
# strip any remaining whitespace
streetaddress = streetaddress.strip()
# filter out defendant name (assuming name will only ever contain alphabetic characters, and that the street address always starts with numbers for the building)
name_filter = r"[a-zA-Z\s]*(\d.*)"
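# quick illustration of name_filter on made-up OCR text (the name and address are fabricated):
#   re.findall(name_filter, 'John Smith 123 Main St, Apt 4')  ->  ['123 Main St, Apt 4']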
streetaddress = re.findall(name_filter, streetaddress)[0]
streetaddress = streetaddress.strip()
## extract city ##
# strip whitespace so that it should start with 'CITY'
city_texts = city_texts.strip()
city_start_idx = city_texts.upper().find('CITY')
# also check for typos of CITY (I've seen CHY, OI, Ciry)
if city_start_idx == -1:
city_start_idx = city_texts.upper().find('CHY')
if city_start_idx == -1:
city_start_idx = city_texts.upper().find('CIRY')
assert city_start_idx != -1, f'the city block should start with CITY but could not find that word in: {city_texts}'
# remove the starting text corresponding to 'CITY'
city = city_texts[city_start_idx + len('CITY'):]
city = city.strip()
# remove the semicolon, or similar punctuation that the OCR might have misread as the semicolon. sometimes it doesn't catch the semicolon at all, so we also need to check for that
if city[0] in [':', ';', ',', '.', '\'']:
city = city[1:]
# strip any remaining whitespace
city = city.strip()
## extract state ##
# strip whitespace so that it should start with 'STATE'
state_texts = state_texts.strip()
state_start_idx = state_texts.upper().find('STATE')
assert state_start_idx != -1, f'the state block should start with STATE but could not find that word in: {state_texts}'
# remove the starting text corresponding to 'STATE'
state = state_texts[state_start_idx + len('STATE'):]
state = state.strip()
# remove the semicolon, or similar punctuation that the OCR might have misread as the semicolon. sometimes it doesn't catch the semicolon at all, so we also need to check for that
if state[0] in [':', ';', ',', '.', '\'']:
state = state[1:]
# strip any remaining whitespace
state = state.strip()
## extract zip code ##
# strip whitespace so that it should start with 'ZIP'
zip_texts = zip_texts.strip()
zip_start_idx = zip_texts.upper().find('ZIP')
# also check for typos of ZIP
if zip_start_idx == -1:
zip_start_idx = zip_texts.upper().find('21P')
assert zip_start_idx != -1, f'the zip block should start with ZIP but could not find that word in: {zip_texts}'
# remove the starting text corresponding to 'ZIP'
zip_texts = zip_texts[zip_start_idx + len('ZIP'):]
# it's possible the space between ZIP and CODE didn't get recognized properly, so we remove the word CODE in a separate step
# strip again to get rid of whitespace before CODE
zip_texts = zip_texts.strip()
code_start_idx = zip_texts.upper().find('CODE')
assert code_start_idx != -1, f'the zip block should have CODE as the second word but could not find that word in: {zip_texts}'
# remove the starting text corresponding to 'CODE'
zip_texts = zip_texts[code_start_idx + len('CODE'):]
zip_texts = zip_texts.strip()
# remove the semicolon, or similar punctuation that the OCR might have misread as the semicolon. sometimes it doesn't catch the semicolon at all, so we also need to check for that
if zip_texts[0] in [':', ';', ',', '.', '\'']:
zip_texts = zip_texts[1:]
# strip any remaining whitespace
zip_texts = zip_texts.strip()
# parse the zip code carefully, because sometimes the second section (+4 digits) of the zip code gets botched by OCR, or is not included
zipcode_5 = zip_texts[:5]
# ^ this is the first part of the zip code with 5 digits, which they should all have
# TODO check this since it seemed to have wrongly let through a 4 digit code
assert zipcode_5.isdigit(
), f"the first 5 characters of the zip code should be all digits, but instead found: {zipcode_5}"
# then check if we have a valid second part of the zip code with 4 digits (this is optional so if we can't find a valid version, we just drop it)
# sometimes we don't have anything, so we have to catch that case too
zip_texts = zip_texts[5:].strip()
zipcode = zipcode_5
# TODO figure out a better way to distinguish valid +4 zip codes from gibberish (which happens often when the text overlaps the horizontal bottom line of the box and the OCR gets confused by the strikethrough)
# for now, we just ignore the +4 zip code, since that shouldn't be necessary for geocoding when we have the rest of the address
# if len(zip_texts) > 0:
# # drop the hyphen if we find one (sometimes there isn't)
# if zip_texts[0] in ['-', '_', '–']:
# zip_texts = zip_texts[1:]
# # then get the next 4 characters and check if they are digits
# zip_texts = zip_texts.strip()
# zipcode_4 = zip_texts[:4]
# if zipcode_4.isdigit():
# zipcode = zipcode_5 + '-' + zipcode_4
# # if not, we just ignore, there was probably an OCR error and the zip+4 isn't required to geolocate the address
# display the original scans and extracted address in case we want to verify results
if verbose:
print("extracted address")
print("street address: ", streetaddress)
print("city: ", city)
print("state: ", state)
print("zipcode: ", zipcode)
return streetaddress, city, state, zipcode
# TODO replace asserts with explicit if/raise checks
def extract_address(case_number,
file_dir,
view_scans=False,
print_address=True):
""" Extract the address from the civil case cover sheet for a single case.
Args:
case_number (str): case identifier, alphanumeric
file_dir (str): path to the folder containing all the scanned legal
documents
view_scans (bool): whether or not to plot the address image crops
(for debugging/verification)
print_address (bool): whether or not to print the extracted address
(for debugging/verification)
Returns: tuple of strings containing street address, city, state, and
zip code, if extraction was successful
"""
last_page_image = cover_sheet_last_page_image(case_number, file_dir)
address_crop, city_crop, state_crop, zip_crop = address_autocrop(
last_page_image)
# display the cropped scans in case we want to verify results
if view_scans:
print("address autocrops")
fig, axs = plt.subplots(1, 4, figsize=(15, 2))
axs = axs.flatten()
axs[0].imshow(address_crop)
axs[1].imshow(city_crop)
axs[2].imshow(state_crop)
axs[3].imshow(zip_crop)
axs[0].set_title('Street address crop')
axs[1].set_title('City crop')
axs[2].set_title('State crop')
axs[3].set_title('Zip code crop')
axs[0].axis('off')
axs[1].axis('off')
axs[2].axis('off')
axs[3].axis('off')
streetaddress, city, state, zipcode = address_from_crops(address_crop,
city_crop,
state_crop,
zip_crop,
verbose=False)
# display the extracted address in case we want to verify results
if print_address:
print("extracted address")
print("street address: ", streetaddress)
print("city: ", city)
print("state: ", state)
print("zipcode: ", zipcode)
return streetaddress, city, state, zipcode
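# illustrative usage for a single case (case number and folder are hypothetical):
#   street, city, state, zipcode = extract_address('20stlc01234', 'scanned_docs/', view_scans=True)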
def extract_init_demand(case_number, file_dir, verbose=False):
""" Extract the initial demand from the complaint for a single case.
Args:
case_number (str): case identifier, alphanumeric
file_dir (str): path to the folder containing all the scanned legal
documents
verbose (bool): whether or not to print extra details (for
debugging/verification)
Returns: float value of the initial demand, as well as the page the value was found on (for debugging purposes)
"""
fpaths = get_file(case_number, 'complaint', file_dir)
if len(fpaths) == 0:
raise Exception(f'could not find complaint for case {case_number}')
elif len(fpaths) > 1:
raise Exception(
f'found {len(fpaths)} complaints for case {case_number}: {fpaths}')
# initial demand should be on the first, second, or third page of the complaint
images = convert_from_path(fpaths[0], dpi=DPI, last_page=3)
# verify that we have at least 2 pages from the doc (it's often OK if we don't have the 3rd)
if len(images) < 2:
raise Exception(
'complaint has fewer than 2 pages, may be missing initial demand')
# # TODO we should still try checking the first page even if we don't have the second page, but it's very unlikely that we have fewer than 2 pages
# cropping the first page to its right half would make things a lot faster; however, we have to be careful with cropping: if we crop too far, we lose the key text, and if we don't crop far enough, the OCR can pick up random letters from the left half of the page between DEMAND and the monetary value
# for now, we just don't crop horizontally
# crop the first page to remove the top and bottom quarter of the page, to speed up processing
first_page_image = np.array(images[0])[800:-800, :]
# first_page_image = np.array(images[0])[800:-800, 1300:] # ignore horizontal crop for now
# crop the second page to the bottom half of the page where section 10 is, to speed up processing
second_page_image = np.array(images[1])[1500:, :]
if len(images) >= 3:
third_page_image = np.array(images[2])
else:
third_page_image = None
# run OCR
tesseract_config = f"--oem 1 --dpi {DPI}"
# NOTE the tesseract character whitelist only works for the legacy mode, not the newer neural net/LSTM mode, which apparently doesn't respect the whitelist
first_page_text = pytesseract.image_to_string(first_page_image,
config=tesseract_config)
second_page_text = pytesseract.image_to_string(second_page_image,
config=tesseract_config)
if third_page_image is not None:
third_page_text = pytesseract.image_to_string(third_page_image,
config=tesseract_config)
else:
third_page_text = ''
# remove all newlines from the text and replace them with spaces so the regex patterns below only have to deal with single-line text
first_page_text = first_page_text.replace('\n', ' ')
second_page_text = second_page_text.replace('\n', ' ')
third_page_text = third_page_text.replace('\n', ' ')
# check which version of the complaint form is used by searching for different variations of the text:
# * page 1:
# * DEMAND: $XXXXX.XX
# * DEMAND AMOUNT: $XXXXX.XX
# * AMOUNT OF DEMAND: $XX,XXX.XX
# * AMOUNT DEMANDED: $XX,XXX.XX
# * Demand is for $XXXXX.XX
# * PRAYER AMOUNT: $XXXXX.XX
# * PRAYER AMT: $XXXXX.XX
# * LIMITED CIVIL: $XXXXX.XX
# * page 2:
# * 10. Plaintiff prays for judgment for costs of suit; for such
# relief as is fair, just, and equitable; and for a. damages of:
# $XXXXX.XX
# * page 3:
# * WHEREFORE, as to all Causes of Action, Plaintiff prays for
# judgment against Defendant, including but not limited to, the
# amounts as follows: For damages of $XXXXX.XX;
# NOTE: we assume between 3-5 digits on the left (integer) side of the decimal, since the demand should be <$25,000 and plaintiffs probably won't sue if it's <$100
# use regex to extract monetary amount
prayer_amount_regex = r"[pP][rR][aA][yY][eE][rR]\s*[aA][mM][oO][uU][nN][tT]\s*[:;,.-]?\s*[$Ss]\s*(\d{0,2}[,.]?\d{0,3}[.,]?\d{2})"
prayer_amt_regex = r"[pP][rR][aA][yY][eE][rR]\s*[aA][mM][tT]\s*[:;,.-]?\s*[$Ss]\s*(\d{0,2}[,.]?\d{0,3}[.,]?\d{2})"
demand_regex = r"[dD][eE][mM][aA][nN][dD]\s*[:;,.-]?\s*[$Ss]\s*(\d{0,2}[,.]?\d{0,3}[.,]?\d{2})"
demand_amount_regex = r"[dD][eE][mM][aA][nN][dD]\s*[aA][mM][oO][uU][nN][tT]\s*[:;,.-]?\s*[$Ss]\s*(\d{0,2}[,.]?\d{0,3}[.,]?\d{2})"
amount_demanded_regex = r"[aA][mM][oO][uU][nN][tT]\s*[dD][eE][mM][aA][nN][dD][eE][dD]\s*[:;,.-]?\s*[$Ss]\s*(\d{0,2}[,.]?\d{0,3}[.,]?\d{2})"
demand_is_for_regex = r"[dD][eE][mM][aA][nN][dD]\s*[iI][sS]\s*[fF][oO][rR]\s*[:;,.-]?\s*[$Ss]\s*(\d{0,2}[,.]?\d{0,3}[.,]?\d{2})"
limited_civil_regex = r"[lLiI][iIl][mM][iIl][tT][eE][dD]\s*[cC][iIl][vV][iIl][lLiI]\s*[:;,.-]?\s*[$Ss]\s*(\d{0,2}[,.]?\d{0,3}[.,]?\d{2})"
damages_of_regex = r"damages\s*of\s*[:;,.-]?\s*[$Ss]\s*[$]?\s*(\d{0,2}[,.]?\d{0,3}[.,]?\d{2})"  # sometimes there are sheets with 2 dollar signs..
# TODO figure out a way to make these more robust to OCR typos
# note that sometimes the decimal point in the monetary value doesn't get detected by OCR, which is why we make it optional in the regex pattern
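# quick illustration of the patterns on made-up OCR text (the amounts are fabricated):
#   re.findall(demand_regex, 'DEMAND: $12,500.00 case type ...')  ->  ['12,500.00']
#   re.findall(damages_of_regex, 'a. damages of: $ 9875.00')      ->  ['9875.00']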
prayer_amount_results = re.findall(prayer_amount_regex, first_page_text)
prayer_amt_results = re.findall(prayer_amt_regex, first_page_text)
demand_results = re.findall(demand_regex, first_page_text)
demand_amount_results = re.findall(demand_amount_regex, first_page_text)
amount_demanded_results = re.findall(amount_demanded_regex,
first_page_text)
demand_is_for_results = re.findall(demand_is_for_regex, first_page_text)
limited_civil_results = re.findall(limited_civil_regex, first_page_text)
damages_of_results_p2 = re.findall(damages_of_regex, second_page_text)
damages_of_results_p3 = re.findall(damages_of_regex, third_page_text)
# convert to value
demands_found = prayer_amount_results + prayer_amt_results + demand_results + demand_amount_results + amount_demanded_results + demand_is_for_results + limited_civil_results + damages_of_results_p2 + damages_of_results_p3
if len(demands_found) == 0:
raise Exception(
'could not find initial demand on first 3 pages; aborting')
if len(demands_found) > 1:
print('found multiple instances of initial demand, which is weird')
# if we somehow find multiple demands, just take the first one for now
# TODO add better handling later
if len(prayer_amount_results) > 0:
found_on = 'page 1 (as PRAYER AMOUNT)'
if len(prayer_amt_results) > 0:
found_on = 'page 1 (as PRAYER AMT)'
if len(demand_results) > 0:
found_on = 'page 1 (as DEMAND)'
if len(demand_amount_results) > 0:
found_on = 'page 1 (as DEMAND AMOUNT)'
if len(amount_demanded_results) > 0:
found_on = 'page 1 (as AMOUNT DEMANDED)'
if len(demand_is_for_results) > 0:
found_on = 'page 1 (as Demand is for)'
if len(limited_civil_results) > 0:
found_on = 'page 1 (as LIMITED CIVIL)'
if len(damages_of_results_p2) > 0:
found_on = 'page 2 (as damages of)'
if len(damages_of_results_p3) > 0:
found_on = 'page 3 (as damages of)'
if verbose:
print(f'found initial demand on {found_on}')
# convert to float
# note we need to remove any commas before converting to float. however, the decimal point may get read as a comma, or not detected at all by the OCR. thus, we just remove all punctuation and divide by 100 to restore the cents, assuming the demand always includes a decimal value (TODO we should verify this assumption)
init_demand_str = demands_found[0]
init_demand_str_no_punc = re.sub('[.,]', '', init_demand_str)
init_demand = float(init_demand_str_no_punc) / 100
if init_demand > 25_000:
raise Warning(
'detected initial demand is greater than $25,000, is this expected?'
)
return init_demand, found_on
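# illustrative usage (hypothetical case number and folder; the returned tuple pairs the float
# amount with a note about where it was found, e.g. (9875.0, 'page 1 (as DEMAND)')):
#   init_demand, found_on = extract_init_demand('20stlc01234', 'scanned_docs/', verbose=True)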
def extract_all_addresses(input_csv_path, file_dir, output_csv_path):
""" Extract the address from the civil case cover sheet for all cases in a
csv.
Args:
input_csv_path (str): path to csv containing case entries (with all
the assigned case numbers, file names, etc but presumably no
addresses)
file_dir (str): path to the folder containing all the scanned legal
documents
output_csv_path (str): path to which updated csv with addresses
will be saved
Output: saves new csv.
"""
# load the csv as a pandas dataframe
df = pd.read_csv(input_csv_path)
# iterate over case number
error_count = 0
for i, case_id in enumerate(df['case_number'].unique()):
print(i + 1, ":", case_id)
try:
streetaddress, city, state, zipcode = extract_address(
case_id, file_dir, view_scans=False, print_address=True)
# add to dataframe
df.loc[(df['case_number'] == case_id) &
(df['Document'] == 'Civil Case Cover Sheet'),
'address'] = (streetaddress + ", " + city + " " + state +
" " + zipcode)
# add additional note columns so we know if it was automated/if
# there were any issues
df.loc[(df['case_number'] == case_id) &
(df['Document'] == 'Civil Case Cover Sheet'),
'automated address'] = 'passed'
df.loc[(df['case_number'] == case_id) &
(df['Document'] == 'Civil Case Cover Sheet'),
'automated address error'] = ''
print()
except Exception as e:
df.loc[(df['case_number'] == case_id) &
(df['Document'] == 'Civil Case Cover Sheet'),
'automated address'] = 'failed'
df.loc[(df['case_number'] == case_id) &
(df['Document'] == 'Civil Case Cover Sheet'),
'automated address error'] = str(e)
error_count += 1
print(e)
print(f'errors: {error_count}/{i+1}')
print()
# save to a new csv
df.to_csv(output_csv_path)
def extract_all_init_demands(input_csv_path, file_dir, output_csv_path):
""" Extract the initial demand from the complaint for all cases in a csv.
Args:
input_csv_path (str): path to csv containing case entries (with all
the assigned case numbers, file names, etc but presumably no
initial demands)
file_dir (str): path to the folder containing all the scanned legal
documents
output_csv_path (str): path to which updated csv with initial demands
will be saved
Output: saves new csv.
"""
# load the csv as a pandas dataframe
df = pd.read_csv(input_csv_path)
# iterate over case number
error_count = 0
for i, case_id in enumerate(df['case_number'].unique()):
print(i + 1, ":", case_id)
try:
init_demand, found_on = extract_init_demand(case_id, file_dir)
print('init_demand', init_demand)
# add to dataframe
df.loc[(df['case_number'] == case_id) &
(df['Document'] == 'Complaint'),
'initial demand amount'] = init_demand
# add additional note columns so we know if it was automated/if
# there were any issues
df.loc[(df['case_number'] == case_id) &
(df['Document'] == 'Complaint'),
'automated initial demand'] = 'passed'
df.loc[(df['case_number'] == case_id) &
(df['Document'] == 'Complaint'),
'automated initial demand error'] = ''
print()
except Exception as e:
df.loc[(df['case_number'] == case_id) &
(df['Document'] == 'Complaint'),
'automated initial demand'] = 'failed'
df.loc[(df['case_number'] == case_id) &
(df['Document'] == 'Complaint'),
'automated initial demand error'] = str(e)
error_count += 1
print(e)
print(f'errors: {error_count}/{i+1}')
print()
# save to a new csv
df.to_csv(output_csv_path)
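# minimal end-to-end sketch (the csv and folder paths below are hypothetical; the input csv is
# assumed to have the 'case_number' and 'Document' columns used above):
# if __name__ == '__main__':
#     FILE_DIR = 'scanned_docs/'
#     extract_all_addresses('cases.csv', FILE_DIR, 'cases_with_addresses.csv')
#     extract_all_init_demands('cases_with_addresses.csv', FILE_DIR, 'cases_with_demands.csv')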