-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrecup_data.bash
executable file
·255 lines (217 loc) · 9.24 KB
/
recup_data.bash
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
#!/bin/bash
#######################################
# Download every Pokedex page into html-base/ and, when data.txt does not
# exist yet, build it with one "<id> <name> <generation>" line per pokemon.
#
# The pokemon name is the 6th '/'-separated field of the URL that
# https://www.pokemon.com/us/pokedex/<id> finally redirects to.
#
# Pages already present in html-base/ are not downloaded again. A page is
# first written to a temporary file and only moved into html-base/ when wget
# succeeds: `wget -O` creates its output file even when the transfer fails,
# and such an empty file would otherwise be taken for a finished download by
# every later run.
#
# Globals:   none (all working variables are local)
# Arguments: none
# Outputs:   progress messages on stdout, download errors on stderr
# Returns:   0 if every missing page was downloaded, 1 otherwise
#######################################
get_html() {
  # Number of pokemons introduced by each generation, in generation order.
  local -a per_gen=(151 100 135 107 156 72 88 89)
  local base_url="https://www.pokemon.com/us/pokedex"
  local id=1      # national pokedex identifier
  local gen=1     # generation currently being walked
  local idGen     # position of the pokemon inside its generation
  local nbPokemons name url
  # Kept outside html-base/ so that an interrupted run cannot leave an extra
  # entry in that directory.
  local tmp="html-base.part.$$"
  local have_data=1 # 0 when data.txt already existed at start-up
  local status=0

  if [ -f data.txt ]; then
    have_data=0
  fi

  for nbPokemons in "${per_gen[@]}"; do
    for ((idGen = 1; idGen <= nbPokemons; idGen++)); do
      url="$base_url/$id"

      if [ "$have_data" -ne 0 ]; then
        echo get name of pokemon "$id"
        name=$(curl -Ls -o /dev/null -w '%{url_effective}' "$url" | cut -d'/' -f 6)
        echo "$id $name $gen" >> data.txt
      fi

      if [ ! -f "html-base/$id" ]; then
        echo get "$url"
        if wget -O "$tmp" "$url"; then
          mv -- "$tmp" "html-base/$id"
        else
          rm -f -- "$tmp"
          echo "get_html: download of $url failed" >&2
          status=1
        fi
      fi

      id=$((id + 1))
    done
    echo end gen "$gen"
    gen=$((gen + 1))
  done

  return "$status"
}
# Collapse every run of blanks in $1 into one space and trim both ends, the
# way an unquoted `echo $1` does, but without pathname expansion.
# The result is returned in REPLY.
_pk_squeeze() {
  local IFS=$' \t\n'
  local -a words=()
  read -r -a words <<< "$1"
  REPLY="${words[*]}"
}

# Join all whitespace-separated words read on stdin into a single line
# (an empty line when there is no input).
_pk_join_words() {
  local IFS=$' \t\n'
  local -a words=()
  read -r -d '' -a words
  printf '%s\n' "${words[*]}"
}

# Read file $1 into the global array _pk_page, one element per line.
# Returns 1 (with an empty array) when the file cannot be read.
_pk_load_page() {
  local row
  _pk_page=()
  [[ -r $1 ]] || return 1
  while IFS= read -r row || [[ -n $row ]]; do
    _pk_page+=("$row")
  done < "$1"
}

# Print, for every line of file $1 containing `<p class="version-x` or
# `<p class="version-y`, the whitespace-squeezed line found two lines below.
_pk_descriptions() {
  local k total
  _pk_load_page "$1" || return 0
  total=${#_pk_page[@]}
  for ((k = 0; k < total; k++)); do
    _pk_squeeze "${_pk_page[k]}"
    if [[ $REPLY == *'<p class="version-'[yx]* ]]; then
      _pk_squeeze "${_pk_page[k + 2]}"
      printf '%s\n' "$REPLY"
    fi
  done
}

#######################################
# Print the types of a pokemon, one output line per `<div class="dtm-type">`
# block of its page, the type names of a block separated by single spaces.
# Arguments: $1 - pokeId (int), the page is read from html-base/<pokeId>
# Outputs:   type lines on stdout
# Returns:   0
#######################################
get_type() {
  local pokeId=$1
  local type_re='<a href="/us/pokedex/\?type=.*">.*</a>'
  local k j total res

  _pk_load_page "html-base/$pokeId" || return 0
  total=${#_pk_page[@]}

  for ((k = 0; k < total; k++)); do
    _pk_squeeze "${_pk_page[k]}"
    [[ $REPLY == *'<div class="dtm-type">'* ]] || continue

    # Collect the type links from the opening line up to the first </div>.
    res=""
    for ((j = k; j < total; j++)); do
      _pk_squeeze "${_pk_page[j]}"
      [[ $REPLY == *'</div>'* ]] && break
      if [[ $REPLY =~ $type_re ]]; then
        res+="$(printf '%s\n' "$REPLY" \
          | sed -E 's/^[^<]*<a href="\/us\/pokedex\/\?type=[[:alpha:]]+">([^<]+)<\/a>/\1/g') "
      fi
    done
    _pk_squeeze "$res"
    printf '%s\n' "$REPLY"
  done
  return 0
}

#######################################
# Build data/<pokeId> for every page of html-base/ that has no data file yet.
# Pages are expected to be named 1..N, N being the number of entries of
# html-base/; line <pokeId> of data.txt describes pokemon <pokeId>.
#
# Layout of a data file, single-form page:
#   data.txt line / stats (one line) / image URL / attribute values
#   (height, weight, category, abilities) / types (one line) /
#   descriptions / evolution chain
# Layout for a page holding a `<select id="formes" name="formes"` element:
#   data.txt line / form names / then for each form: stats, image URL,
#   height, weight, category, abilities / descriptions / types (one line per
#   type block) / evolution chain
#
# Arguments: none
# Outputs:   progress messages on stdout, data files under data/
# Returns:   0 (a failing extraction step is not reported)
#######################################
get_data() {
  # Keep the LAST run of digits of a line. The previous expression captured
  # a single digit, which truncated every value above 9.
  local stats_sed='s/.*[^[:digit:]]([[:digit:]]+)[^[:digit:]]*$/\1/'
  local img_re='(img).*https://assets.pokemon.com/assets/cms2/img/pokedex/full.*png'
  local attr_re='<span class="attribute-value">.*<'
  local attr_sed='s/ *<span class="attribute-value">([^<]+)<\/span>/\1/'
  local type_sed='s/^[^<]*<a href="\/us\/pokedex\/\?type=[[:alpha:]]+">([^<]+)<\/a>/\1/g'

  local entry count=0
  for entry in html-base/*; do
    if [[ -e $entry ]]; then
      count=$((count + 1))
    fi
  done

  local pokeId html out name
  local nb_forms num_form decalage seen k j total
  for ((pokeId = 1; pokeId <= count; pokeId++)); do
    html="html-base/$pokeId"
    out="data/$pokeId"
    [[ -f $out ]] && continue

    name=$(head -n "$pokeId" data.txt | tail -n 1 | cut -d' ' -f2)
    echo create "$out" - "$name"
    head -n "$pokeId" data.txt | tail -n 1 > "$out"
    # Hot fix kept from the original script: end-of-file problem of data.txt.
    if ((pokeId == 898)); then
      echo '' >> "$out"
    fi

    if ! grep -q '<select id="formes" name="formes"' "$html"; then
      # ---- Only one form ----
      # Fight stats: hp | attack | defense | special attack | special defense | speed
      grep 'data-value' "$html" | sed -E "$stats_sed" | _pk_join_words >> "$out"
      # Image
      grep -E "$img_re" "$html" | cut -d'"' -f 4 >> "$out"
      # Characteristics: height, weight, category, abilities
      grep -E "$attr_re" "$html" | sed -E "$attr_sed" >> "$out"
      # Types
      grep '<a href="/us/pokedex/?type=.*">.*</a>' "$html" \
        | sed -E "$type_sed" | _pk_join_words >> "$out"
      # Descriptions
      _pk_descriptions "$html" >> "$out"
    else
      # ---- Several forms on the same page ----
      nb_forms=$(grep -cE "$img_re" "$html")
      echo "$out" have "$nb_forms" forms
      # Names
      grep '<img src="https://assets.pokemon.com/assets/cms2/img/pokedex/full/' "$html" \
        | sed -E 's/^.*alt="([^"]+).*$/\1/g' >> "$out"

      _pk_load_page "$html"
      total=${#_pk_page[@]}
      # A form owning n abilities has n - 1 more attribute values than the
      # four assumed per form; decalage accumulates that surplus so that the
      # next form's height/weight/category are sliced at the right place.
      decalage=0
      for ((num_form = 1; num_form <= nb_forms; num_form++)); do
        # Fight stats.
        # NOTE(review): five values are taken per form although six stats
        # are listed above for the single-form case - confirm against the
        # page layout before changing, the consumer may rely on it.
        grep 'data-value' "$html" | sed -E "$stats_sed" \
          | head -n $((5 * num_form)) | tail -n 5 | _pk_join_words >> "$out"
        # Image
        grep -E "$img_re" "$html" | sed -E 's/^.*src="([^"]+).*/\1/g' \
          | head -n "$num_form" | tail -n 1 >> "$out"
        # Height, weight, category
        grep -E "$attr_re" "$html" | sed -E "$attr_sed" \
          | head -n $((4 * num_form + decalage)) | tail -n 4 | head -n 3 >> "$out"
        # Abilities: values listed between the num_form-th "Abilities" title
        # and the next </ul>. The scan is bounded by the end of the page, so
        # a page without </ul> can no longer loop forever.
        seen=0
        for ((k = 0; k < total; k++)); do
          _pk_squeeze "${_pk_page[k]}"
          [[ $REPLY == *'<span class="attribute-title">Abilities</span>'* ]] || continue
          seen=$((seen + 1))
          ((seen == num_form)) || continue
          for ((j = k; j < total; j++)); do
            _pk_squeeze "${_pk_page[j]}"
            [[ $REPLY == *'</ul>'* ]] && break
            if [[ $REPLY == *'<span class="attribute-value">'* ]]; then
              printf '%s\n' "$REPLY" \
                | sed -E 's/^<span.*>(.*)<\/span>$/\1/g' >> "$out"
              decalage=$((decalage + 1))
            fi
          done
          decalage=$((decalage - 1))
          break
        done
      done

      # Descriptions. The whole page is scanned: the previous loop bound was
      # a line count left over from the last single-form page.
      _pk_descriptions "$html" >> "$out"
      # Types
      get_type "$pokeId" >> "$out"
    fi

    # Evolution chain; when the page shows none, the pokemon's own name.
    # NOTE(review): the probe only accepts purely alphabetic names while the
    # extraction below also accepts names such as "mr-mime" - confirm whether
    # the stricter probe is intended.
    if ! grep -qE '<a href="/us/pokedex/[[:alpha:]]*">' "$html"; then
      head -n "$pokeId" data.txt | tail -n 1 | cut -d' ' -f2 >> "$out"
    else
      grep -E '<a href="/us/pokedex/[[:alpha:]][^"]*">' "$html" \
        | sed -E 's/<a href="\/us\/pokedex\/([[:alpha:]][^"]*)">$/\1/g' >> "$out"
    fi
  done
  return 0
}
#######################################
# Download into img/ every picture referenced by the data files.
#
# data/1 .. data/N are visited, N being the number of entries of html-base/.
# Every whitespace-separated word of a data line containing ".png" is taken
# as a URL and saved as img/<basename of the URL>, unless that file already
# exists.
#
# A picture is downloaded to "<target>.part" and renamed only when wget
# succeeds: `wget -O` creates its output file even when the transfer fails,
# and the leftover empty file would prevent any later retry.
#
# Arguments: none
# Outputs:   progress messages on stdout, download errors on stderr
# Returns:   0 if every missing picture was downloaded, 1 otherwise
#######################################
get_img() {
  local entry count=0
  for entry in html-base/*; do
    if [[ -e $entry ]]; then
      count=$((count + 1))
    fi
  done

  local pokeId url target status=0
  local -a urls
  for ((pokeId = 1; pokeId <= count; pokeId++)); do
    [[ -r data/$pokeId ]] || continue

    # Split the matching lines into words without subjecting them to
    # pathname expansion (URLs may contain '?' or '*').
    urls=()
    IFS=$' \t\n' read -r -d '' -a urls < <(grep '\.png' "data/$pokeId")

    for url in "${urls[@]}"; do
      target="img/$(basename -- "$url")"
      if [[ ! -f $target ]]; then
        echo create "$target"
        if wget "$url" -O "$target.part" -nv; then
          mv -- "$target.part" "$target"
        else
          rm -f -- "$target.part"
          echo "get_img: download of $url failed" >&2
          status=1
        fi
      fi
    done
  done

  return "$status"
}
### MAIN ###
# Three stages, each working in its own directory that is created on the
# first run. Every stage skips the files it already produced, so the script
# can simply be run again after an interruption.

# Stage 1: raw Pokedex pages (and data.txt when it is missing).
# get_html is called once, whether or not html-base had to be created.
if [ -d html-base ]; then
  echo html-base exists
else
  echo create html-base
  mkdir html-base
fi
get_html

# Stage 2: one data file per pokemon, extracted from the pages.
if [ -d data ]; then
  echo data exists
else
  mkdir data
fi
get_data

# Stage 3: pictures referenced by the data files.
if [ -d img ]; then
  echo img exists
else
  mkdir img
fi
get_img
# echo 'python3 send-datas.py'
# python3 send-data.py