-
Notifications
You must be signed in to change notification settings - Fork 90
/
Copy pathocr
executable file
·46 lines (37 loc) · 1.09 KB
/
ocr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/usr/bin/perl
#
# OCR an image (extract text) from a file/url
# - wrapper for tesseract and imagemagick
#
# -samy kamkar 2020/12/06
use strict;
use warnings;
use File::Temp qw(tempdir);
use y;;;
# Main script flow:
#   1. take exactly one argument: a local image path or a URL
#   2. if the argument is not an existing local path, fetch it with wget
#   3. preprocess the image with ImageMagick's convert into a PNG
#   4. run tesseract on that PNG and print the text file it produces
#
# runv() and cat() are not defined in this file; presumably they come from
# the `y` module loaded above (runv runs an external command from a list of
# arguments, cat returns a file's contents) -- confirm against that module.
die "usage: $0 <path/url to image>\n" unless @ARGV == 1;

my $TESSERACT = "tesseract"; # ocr engine
my $WGET = "wget";
my $CONVERT = "convert"; # imagemagick

# options handed to convert between the input and output file names
my @convopts = ('-draw', 'color 0,0 floodfill', qw/-fuzz 50% -fill white -opaque white -bordercolor black -border 2 -fill white -alpha off -negate -units pixelsperinch -density 72/);

my $DIR = "/tmp";
my $file = shift;

# All scratch files live in a private, freshly created directory under $DIR
# instead of directly in $DIR under time()-based names: those names were
# predictable in a world-writable directory and collided when two runs
# started within the same second. CLEANUP => 1 removes the directory and
# everything in it when the script exits.
my $work = tempdir(".ocr.XXXXXXXX", DIR => $DIR, CLEANUP => 1);

# Anything that is not an existing local path is treated as a URL.
# This test (and the convert call below) runs before chdir so that
# relative paths given on the command line still resolve.
if (!-e $file)
{
    my $nf = "$work/ocr.get";
    runv($WGET, $file, "-O", $nf);

    # wget -O can leave an empty file behind when the fetch fails; stop here
    # with a clear message rather than feeding nothing to convert
    die "$0: could not download '$file'\n" unless -s $nf;
    $file = $nf;
}

my $img = "$work/ocr.new.png";
runv($CONVERT, $file, @convopts, $img);
#system("open", $img);

# tesseract needs to operate in the directory for some reason?
# otherwise get no such file (in a specific version)
chdir($work) or die "$0: can't chdir to $work: $!\n";

# run tesseract and output to file, it adds .txt to end
my $out = "$work/ocr.out";
runv($TESSERACT, $img, $out);
$out .= ".txt";

# read our file
print cat($out);
# manual alternative to cat(), kept for reference:
#open(OUT, "<$out") || die $!;
#print while <OUT>;
#close(OUT);