-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathvanilla.sh
111 lines (90 loc) · 2.83 KB
/
vanilla.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/bin/zsh
#CardsAgainstHumanity has the text encoded as a vector instead of as text.
#Extracts the text from CAH_MainGame.pdf (the input file name this script reads).
#Creates White-ocrad.txt, White-tess.txt, Black-tess.txt, and Black-ocrad.txt indicating card color & OCR engine.
#the best OCRed version should be chosen and cleaned, and probably spell checked.
croppdf (){
  #Crop a PDF with `pdfmanipulate crop`.
  #Arguments (all measurements are given in inches and converted to whole points):
  #  $1 - input PDF
  #  $2 - output PDF
  #  $3 - l, passed to pdfmanipulate as -x
  #  $4 - b, passed to pdfmanipulate as -y
  #  $5 - r, passed to pdfmanipulate as -v
  #  $6 - t, passed to pdfmanipulate as -w
  #  $7 - page width  (only needed by the commented-out ghostscript variants below)
  #  $8 - page height (only needed by the commented-out ghostscript variants below)
  #http://stackoverflow.com/questions/6183479/cropping-a-pdf-using-ghostscript-9-01
  #72 pts / in, gs does everything in points
  local in=$1
  local out=$2
  #printf %d truncates the (possibly fractional) product to an integer point count
  local l=$(printf %d $(($3 * 72)))
  local b=$(printf %d $(($4 * 72)))
  local r=$(printf %d $(($5 * 72)))
  local t=$(printf %d $(($6 * 72)))
  local width=$(printf %d $(($7 * 72)))
  #height comes from $8; fall back to $7 so a 7-argument call keeps working as it did before
  local height=$(printf %d $((${8:-$7} * 72)))
  #gs \
  #-o $out \
  #-sDEVICE=pdfwrite \
  #-dDEVICEWIDTHPOINTS=$width \
  #-dDEVICEHEIGHTPOINTS=$height \
  #-dFIXEDMEDIA \
  #-c "$l $b translate 0 0 $(($width)) $(($height)) rectclip" \
  #-f $in
  #echo gs \
  #-o $out \
  #-sDEVICE=pdfwrite \
  #-c "[/CropBox [$l $b $r $t] /PAGES pdfmark" \
  #-f $in
  pdfmanipulate crop -o "$out" -y "$b" -x "$l" -v "$r" -w "$t" "$in"
}
#Split White Cards From Black Cards
#Step 1: remove the outputs of any previous run so stale files cannot be mixed into this one
rm -f BlackCards{,-nocrop}.p{df,bm} WhiteCards{,-nocrop}.p{df,bm} AllCards.pbm
#Create card-images before cleaning it, and clean it with find instead of a
#card-images/*.pbm glob: under zsh's default NOMATCH option an unmatched glob
#(first run, or an already-empty directory) is reported as an error.
mkdir -p card-images
find card-images -maxdepth 1 -name '*.pbm' -exec rm -f {} +
#pages 2-24 of the source PDF go to the white-card file, pages 25-29 to the black-card file
stapler sel CAH_MainGame.pdf 2-24 WhiteCards-nocrop.pdf
stapler sel CAH_MainGame.pdf 25-29 BlackCards-nocrop.pdf
#crop out margins (arguments are inches: .25 / .5 / .25 / .5 margins, 8 x 10 page)
croppdf WhiteCards-nocrop.pdf WhiteCards.pdf .25 .5 .25 .5 8 10
croppdf BlackCards-nocrop.pdf BlackCards.pdf .25 .5 .25 .5 8 10
#convert to b/w bitmap (pbm format since that's what ocrad supports and they're pretty small since b/w only)
#-append stacks all pages of a PDF into one tall image; extractpage relies on that when it
#computes the vertical offset of a page
convert -append -density 300x300 WhiteCards.pdf WhiteCards.pbm
#-negate inverts the black-card pages (presumably so the OCR engines see dark text on a light background)
convert -append -density 300x300 -negate BlackCards.pdf BlackCards.pbm
#Step 2: run ocr in parts
# each page holds 4x5 cards (20 cards per page)
# extractpage works with 2400x3000 px pages and a 600x600 px grid cell per card
#NOTE(review): this comment previously said pages are 2550x4125 px and cards are h=825 w=637,
#which does not match the offsets extractpage uses - confirm which numbers are right
#start the OCR result files from scratch; extractpage appends to them
rm -f White-tess.txt Black-tess.txt
rm -f White-ocrad.txt Black-ocrad.txt
extractpage(){
  #Cut one card out of ${shade}Cards.pbm, OCR it with both tesseract and ocrad, and append
  #each engine's text (collapsed onto a single line) to $shade-tess.txt / $shade-ocrad.txt.
  #Arguments:
  #  $1 - shade: prefix of the source bitmap and of the result files (the script uses White / Black)
  #  $2 - page:  zero-based page index inside the stacked bitmap
  #  $3 - row:   zero-based card row on that page
  #  $4 - col:   zero-based card column on that page
  #Outputs: prints the card image path and the crop geometry to stdout.
  local shade=$1
  local page=$2
  local row=$3
  local col=$4
  #img is local so it does not leak into (or clobber) the caller's scope
  local img="card-images/$shade-$page-$row-$col.pbm"
  #card path without the .pbm extension; the OCR output files are named after it
  local base=${img%.pbm}
  echo "$img"
  #page width = 2400
  #page height = 3000
  #card width = 600
  #card height = 600
  #pages are stacked vertically, so each page adds 3000 px to y; the +10 skips the cell border
  local y=$((3000 * $page + 600 * $row + 10))
  local x=$((600 * $col + 10))
  #echo "x=$x,y=$y"
  #local geo=600x600+${x}+${y}
  #crop window is smaller than the 600x600 cell (590 wide, 480 high)
  local geo=590x480+${x}+${y}
  echo "geo=$geo"
  convert -crop "$geo" "${shade}Cards.pbm" "$img"
  #tesseract appends .txt to the output base name it is given
  tesseract "$img" "$base-tess"
  ocrad "$img" > "$base-ocrad.txt"
  #flatten each card's text to one line, then terminate the line, so every card is one row
  tr '\n' ' ' < "$base-tess.txt" >> "$shade-tess.txt"
  tr '\n' ' ' < "$base-ocrad.txt" >> "$shade-ocrad.txt"
  echo >> "$shade-tess.txt"
  echo >> "$shade-ocrad.txt"
}
#Run extractpage on every card slot: for each shade, every page, 5 rows x 4 columns per page.
#Page indices are zero-based: 23 white pages (source pages 2-24) and 5 black pages (25-29).
for shade in White Black; do
  case $shade in
    White) maxpages=22 ;;
    Black) maxpages=4 ;;
    *)
      #sanity check: report on stderr and fail with a non-zero status
      echo 'WTF!' >&2
      exit 1
      ;;
  esac
  for ((page = 0; page <= maxpages; page++)); do
    for ((row = 0; row <= 4; row++)); do
      for ((col = 0; col <= 3; col++)); do
        extractpage $shade $page $row $col
      done
    done
  done
done